Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
bubbajoe committed Mar 13, 2018
0 parents commit f932836
Show file tree
Hide file tree
Showing 20 changed files with 256,997 additions and 0 deletions.
16,823 changes: 16,823 additions & 0 deletions corpus/austen-emma.txt

Large diffs are not rendered by default.

6,793 changes: 6,793 additions & 0 deletions corpus/files-11-18/chesterton-thursday.txt

Large diffs are not rendered by default.

18,297 changes: 18,297 additions & 0 deletions corpus/files-11-18/edgeworth-parents.txt

Large diffs are not rendered by default.

3,286 changes: 3,286 additions & 0 deletions corpus/files-11-18/files15-18/files-17-18/shakespeare-macbeth.txt

Large diffs are not rendered by default.

17,435 changes: 17,435 additions & 0 deletions corpus/files-11-18/files15-18/files-17-18/whitman-leaves.txt

Large diffs are not rendered by default.

3,523 changes: 3,523 additions & 0 deletions corpus/files-11-18/files15-18/shakespeare-caesar.txt

Large diffs are not rendered by default.

4,922 changes: 4,922 additions & 0 deletions corpus/files-11-18/files15-18/shakespeare-hamlet.txt

Large diffs are not rendered by default.

22,924 changes: 22,924 additions & 0 deletions corpus/files-11-18/melville-moby-dick.txt

Large diffs are not rendered by default.

10,635 changes: 10,635 additions & 0 deletions corpus/files-11-18/milton-paradise.txt

Large diffs are not rendered by default.

8,471 changes: 8,471 additions & 0 deletions corpus/files-2-10/austen-persuasion.txt

Large diffs are not rendered by default.

14,796 changes: 14,796 additions & 0 deletions corpus/files-2-10/files-3-6/austen-sense.txt

Large diffs are not rendered by default.

99,805 changes: 99,805 additions & 0 deletions corpus/files-2-10/files-3-6/files-4-6/bible-kjv.txt

Large diffs are not rendered by default.

1,441 changes: 1,441 additions & 0 deletions corpus/files-2-10/files-3-6/files-4-6/blake-poems.txt

Large diffs are not rendered by default.

5,538 changes: 5,538 additions & 0 deletions corpus/files-2-10/files-3-6/files-4-6/bryant-stories.txt

Large diffs are not rendered by default.

1,671 changes: 1,671 additions & 0 deletions corpus/files-2-10/files-7-10/burgess-busterbrown.txt

Large diffs are not rendered by default.

3,331 changes: 3,331 additions & 0 deletions corpus/files-2-10/files-7-10/carroll-alice.txt

Large diffs are not rendered by default.

9,548 changes: 9,548 additions & 0 deletions corpus/files-2-10/files-7-10/chesterton-ball.txt

Large diffs are not rendered by default.

7,654 changes: 7,654 additions & 0 deletions corpus/files-2-10/files-7-10/chesterton-brown.txt

Large diffs are not rendered by default.

86 changes: 86 additions & 0 deletions indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# indexing.py - Be sure to run in python 3.6.3^
# Author: Joe Williams
# ReadMe - the first time through, the program will take a
# while to process and serialize the data, but it won't after that
import pickle # For serializing data
import os.path # For checking whether a file exist
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# For stemming and word tokenization
ps = PorterStemmer()

# Takes a file that has a list of files
def getInputFiles(filelist):
    """Read a manifest file and return the file paths it lists.

    Args:
        filelist: Path of a text file holding one file path per line.

    Returns:
        A list of the manifest's non-blank lines, in file order.

    Raises:
        OSError: If the manifest cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filelist) as stream:
        lines = stream.read().split("\n")
    # A trailing newline (or a stray blank line) would otherwise produce an
    # empty "path" that fails later when the caller tries to open it.
    return [line for line in lines if line.strip()]

# Removes most special characters and caps
def preprocess(data):
    """Lower-case *data* and strip or blank out punctuation.

    Sentence-level separators (newline, ';', '?', ':', '!', '.', ',') are
    turned into a single space each; every other listed special character
    (including tab and '-') is deleted outright.

    Args:
        data: The raw text to normalise.

    Returns:
        The normalised text.
    """
    special = "!.,:@#$%^&?<>'*()[}{]-=;/\"\\\t\n"
    separators = '\n;?:!.,.'
    # One translation table: separators map to a space, the rest to None
    # (which str.translate treats as "delete this character").
    table = {ord(ch): (' ' if ch in separators else None) for ch in special}
    return data.lower().translate(table)

def createPositionalIndex(files):
    """Build a positional index over the given files.

    Args:
        files: Sequence of file paths. A file's position in this sequence
            is used as its document id.

    Returns:
        A dict mapping each stemmed token to a list of
        (doc_id, token_position) tuples, in the order encountered.

    Raises:
        OSError: If any listed file cannot be opened or read.
    """
    index = {}
    for doc_id, path in enumerate(files):
        # Close each corpus file as soon as it has been read instead of
        # leaking one open handle per document.
        with open(path) as f:
            # Splitting on a single space (which keeps empty tokens) is
            # deliberate: showPreview tokenises the same way, so the
            # positions recorded here line up with what it displays.
            tokens = preprocess(f.read()).split(' ')
        for position, word in enumerate(tokens):
            index.setdefault(ps.stem(word), []).append((doc_id, position))
    return index

def showPreview(positions, radius):
    """Print a short text snippet around each match.

    Args:
        positions: Iterable of (doc_id, word_index) tuples, where doc_id is
            an index into the list read from "input-files.txt" and
            word_index is a token position within that document.
        radius: Number of tokens to show on each side of the matched token.

    Each output line has the form "<snippet> - <doc_id>:<word_index>".
    """
    # The manifest does not change between matches, so read it once.
    docArr = getInputFiles("input-files.txt")
    # Tokenised documents, keyed by doc_id, so a document with many matches
    # is read and preprocessed only once.
    tokens_by_doc = {}
    for doc_id, word_index in positions:
        if doc_id not in tokens_by_doc:
            with open(docArr[doc_id]) as f:
                tokens_by_doc[doc_id] = preprocess(f.read()).split(' ')
        wordArr = tokens_by_doc[doc_id]
        # Clamp the start: a negative slice start would wrap around to the
        # end of the document and yield an empty or wrong snippet.
        start = max(0, word_index - radius)
        # +1 so the window holds `radius` tokens after the match as well.
        snippet = " ".join(wordArr[start:word_index + radius + 1])
        print("{} - {}:{}".format(snippet, doc_id, word_index))

# Positional index: stemmed token -> list of (doc_id, token_position).
pi = {}
if not os.path.isfile("index_data"):
    # No cached index yet: build it from the corpus and cache it on disk.
    print("Creating and serializing data for future use...")
    files = getInputFiles("input-files.txt")
    pi = createPositionalIndex(files)
    with open("index_data","wb") as f:
        pickle.dump(pi,f)
else:
    # NOTE(review): pickle.load executes arbitrary code from the file it
    # reads; only keep this if "index_data" is always produced locally.
    print("Loading data...")
    with open("index_data","rb") as f:
        pi = pickle.load(f)

def _phrase_matches(index, first, second):
    """Return (doc, pos) of every `first` immediately followed by `second`."""
    followers = set(index.get(second, ()))
    return [(doc, pos) for doc, pos in index.get(first, ())
            if (doc, pos + 1) in followers]


def _proximity_matches(index, first, second, distance):
    """Return (doc, pos) of every `first` within `distance` tokens of `second`."""
    others = set(index.get(second, ()))
    offsets = range(-distance, distance + 1)
    return [(doc, pos) for doc, pos in index.get(first, ())
            if any((doc, pos + off) in others for off in offsets)]


# Interactive query loop.
#   "<word1> <word2>"      -> phrase query (word2 directly after word1)
#   "<word1> <word2> <n>"  -> proximity query (words at most n tokens apart)
#   "exit"                 -> quit
while True:
    print("Enter Query: 'Love her 4'")
    try:
        # split() without an argument tolerates repeated or surrounding spaces.
        q = input().lower().split()
    except EOFError:
        # Input stream closed (e.g. piped input ran out): quit quietly.
        raise SystemExit from None
    if len(q) == 2:
        # The index is keyed by stems, so stem the query terms as well.
        word1, word2 = (ps.stem(w) for w in q)
        print(word1 + " " + word2)
        showPreview(_phrase_matches(pi, word1, word2), 5)
    elif len(q) == 3:
        try:
            # The distance arrives as text; comparing an int with a str
            # would raise TypeError.
            length = int(q[2])
        except ValueError:
            print("Third arg must be a whole number")
            continue
        word1 = ps.stem(q[0])
        word2 = ps.stem(q[1])
        showPreview(_proximity_matches(pi, word1, word2, length), 5)
    elif q and q[0] == 'exit':
        raise SystemExit
    else:
        print("Needs to have 2 or 3 args")
18 changes: 18 additions & 0 deletions input-files.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
./corpus/austen-emma.txt
./corpus/files-11-18/chesterton-thursday.txt
./corpus/files-11-18/edgeworth-parents.txt
./corpus/files-11-18/files15-18/files-17-18/shakespeare-macbeth.txt
./corpus/files-11-18/files15-18/files-17-18/whitman-leaves.txt
./corpus/files-11-18/files15-18/shakespeare-caesar.txt
./corpus/files-11-18/files15-18/shakespeare-hamlet.txt
./corpus/files-11-18/melville-moby-dick.txt
./corpus/files-11-18/milton-paradise.txt
./corpus/files-2-10/austen-persuasion.txt
./corpus/files-2-10/files-3-6/austen-sense.txt
./corpus/files-2-10/files-3-6/files-4-6/bible-kjv.txt
./corpus/files-2-10/files-3-6/files-4-6/blake-poems.txt
./corpus/files-2-10/files-3-6/files-4-6/bryant-stories.txt
./corpus/files-2-10/files-7-10/burgess-busterbrown.txt
./corpus/files-2-10/files-7-10/carroll-alice.txt
./corpus/files-2-10/files-7-10/chesterton-ball.txt
./corpus/files-2-10/files-7-10/chesterton-brown.txt

0 comments on commit f932836

Please sign in to comment.