-
Notifications
You must be signed in to change notification settings - Fork 2
/
WordVectorizer.java
242 lines (127 loc) · 6.34 KB
/
WordVectorizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
package arkangel;
import java.io.*;
import java.util.*;
//The following interface defines the method signatures that every text-to-vector class/app needs to implement,
//although each implementation may do so in a different way.
//Text cleaning, forming a bag of words (essentially a vocabulary of words with their frequencies) and converting
//text to a vector based on that vocabulary are steps that every text classifier needs to implement in different ways
//depending on its requirements, so we keep them in an interface.
/**
 * Operations a text-to-vector component provides: cleaning raw text, growing a vocabulary from it,
 * and encoding a line of text as a vector over that vocabulary.
 */
interface text2Matrix {
    /** Returns {@code line} with noise (links, mentions, punctuation, ...) removed. */
    String TextCleaner(String line);

    /** Tokenizes {@code line} and records its tokens in the vocabulary. */
    void bagOfWordsCreator(String line);

    /** Encodes {@code line} as a vector with one entry per vocabulary word. */
    ArrayList<Integer> wordVectorizer(String line);
}

/**
 * Reads labelled tweets from comma-separated files, builds a bag-of-words vocabulary from the
 * training file and encodes every tweet as a binary presence vector over that vocabulary.
 *
 * <p>Each row is expected to carry the integer label in column 0 and the tweet text in column 5.
 * Rows are split on every comma, so the text column ends at the first comma inside the tweet.
 * NOTE(review): confirm the files are pre-processed so that tweet text contains no commas.
 *
 * <p>{@link #lines} and {@link #test_lines} are static and therefore shared by all instances.
 */
public class WordVectorizer implements text2Matrix {

    /** Stop-word file used by {@link #trainDataReader(String)} when no path is supplied. */
    private static final String DEFAULT_STOPWORDS_PATH = "C:\\Users\\zakir\\Desktop\\stopwords.txt";

    /** Zero-based column holding the integer label. */
    private static final int LABEL_COLUMN = 0;

    /** Zero-based column holding the tweet text. */
    private static final int TEXT_COLUMN = 5;

    /** Words occurring this many times or fewer in the training data are dropped from the vocabulary. */
    private static final int RARE_WORD_MAX_FREQUENCY = 2;

    // Patterns are compiled once; TextCleaner runs for every row of every file.
    private static final java.util.regex.Pattern URL_PATTERN =
            java.util.regex.Pattern.compile("\\b(?:https?://|www\\.)\\S*",
                    java.util.regex.Pattern.CASE_INSENSITIVE);
    private static final java.util.regex.Pattern MENTION_PATTERN =
            java.util.regex.Pattern.compile("@\\S*");
    private static final java.util.regex.Pattern HTML_ENTITY_PATTERN =
            java.util.regex.Pattern.compile("&(?:amp|quot);", java.util.regex.Pattern.CASE_INSENSITIVE);
    private static final java.util.regex.Pattern RETWEET_PATTERN =
            java.util.regex.Pattern.compile("\\brt\\b", java.util.regex.Pattern.CASE_INSENSITIVE);
    private static final java.util.regex.Pattern PUNCTUATION_PATTERN =
            java.util.regex.Pattern.compile("[!#$%^&*?,.;?\"0-9/;():-]");

    /** Labels of the training rows, in the order the rows were read. */
    public ArrayList<Integer> label_array = new ArrayList<Integer>();
    /** Labels of the test rows, in the order the rows were read. */
    public ArrayList<Integer> test_label_array = new ArrayList<Integer>();
    /** One presence vector per entry of {@link #test_lines}. */
    public ArrayList<ArrayList<Integer>> test_matrix = new ArrayList<ArrayList<Integer>>();
    /** One presence vector per entry of {@link #lines}. */
    public ArrayList<ArrayList<Integer>> input_matrix = new ArrayList<ArrayList<Integer>>();
    /** Every token seen while building the vocabulary, duplicates included (used for frequencies). */
    ArrayList<String> wordsList = new ArrayList<String>();
    /** Stop words loaded by {@link #stopWordsReader(ArrayList, String)}. */
    ArrayList<String> stopWords = new ArrayList<String>();
    /** The vocabulary: unique tokens that survived {@link #wordRemover()}. */
    public Set<String> bagOfWords = new HashSet<String>();

    /** Cleaned text of every training row read so far. */
    static ArrayList<String> lines = new ArrayList<String>();
    /** Cleaned text of every test row read so far. */
    static ArrayList<String> test_lines = new ArrayList<String>();

    /**
     * Loads the training data using the default stop-word file.
     *
     * @param train_path path of the training CSV file
     * @throws IOException if the stop-word file or the training file cannot be read
     * @see #trainDataReader(String, String)
     */
    void trainDataReader(String train_path) throws IOException {
        trainDataReader(train_path, DEFAULT_STOPWORDS_PATH);
    }

    /**
     * Loads stop words, reads and cleans every training row, builds the vocabulary, prunes it with
     * {@link #wordRemover()} and finally appends one vector per entry of {@link #lines} to
     * {@link #input_matrix}.
     *
     * <p>Malformed rows are reported on {@code System.err} and skipped, so a single bad row does
     * not discard the rest of the file. I/O failures are propagated to the caller.
     *
     * @param train_path path of the training CSV file
     * @param stopwords_path path of a text file with one stop word per line
     * @throws IOException if either file cannot be opened or read
     */
    void trainDataReader(String train_path, String stopwords_path) throws IOException {
        stopWordsReader(stopWords, stopwords_path);
        loadRows(train_path, lines, label_array, true);
        // The vocabulary must be final before vectorizing, otherwise vector lengths would differ.
        wordRemover();
        for (String cleaned : lines) {
            input_matrix.add(wordVectorizer(cleaned));
        }
    }

    /**
     * Reads and cleans every test row, then appends one vector per entry of {@link #test_lines}
     * to {@link #test_matrix} using the current vocabulary (which is not modified).
     *
     * <p>Malformed rows are reported on {@code System.err} and skipped.
     *
     * @param test_path path of the test CSV file
     * @throws IOException if the file cannot be opened or read
     */
    void testDataReader(String test_path) throws IOException {
        loadRows(test_path, test_lines, test_label_array, false);
        for (String cleaned : test_lines) {
            test_matrix.add(wordVectorizer(cleaned));
        }
    }

    /**
     * Reads {@code path} row by row, appending the cleaned, lower-cased text to {@code texts} and
     * the parsed label to {@code labels}. Both are appended together, only after the whole row has
     * been validated, so the two lists always stay aligned. Each cleaned tweet is echoed to
     * {@code System.out}.
     *
     * @param growVocabulary whether each cleaned row is also fed to {@link #bagOfWordsCreator}
     */
    private void loadRows(String path, List<String> texts, List<Integer> labels, boolean growVocabulary)
            throws IOException {
        // try-with-resources closes the reader on every exit path.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            System.out.println("TWEETS\n");
            int rowNumber = 0;
            for (String row; (row = reader.readLine()) != null;) {
                rowNumber++;
                String[] columns = row.split(",");
                if (columns.length <= TEXT_COLUMN) {
                    System.err.println("Skipping malformed row " + rowNumber + " in " + path
                            + ": expected at least " + (TEXT_COLUMN + 1) + " comma-separated columns");
                    continue;
                }
                int label;
                try {
                    label = Integer.parseInt(columns[LABEL_COLUMN].trim());
                } catch (NumberFormatException nfe) {
                    System.err.println("Skipping malformed row " + rowNumber + " in " + path
                            + ": label is not an integer: " + columns[LABEL_COLUMN]);
                    continue;
                }
                // Locale.ROOT keeps lower-casing independent of the machine's default locale.
                String cleaned = TextCleaner(columns[TEXT_COLUMN].toLowerCase(Locale.ROOT));
                texts.add(cleaned);
                labels.add(label);
                if (growVocabulary) {
                    bagOfWordsCreator(cleaned);
                }
                System.out.println(cleaned);
            }
        }
    }

    /**
     * Removes links, {@code @mentions}, the HTML entities {@code &amp;} / {@code &quot;}, the
     * retweet marker {@code RT} (any case, whole word only) and a fixed set of punctuation
     * characters and digits from {@code text}.
     *
     * <p>Links, mentions and entities are removed before punctuation is stripped: stripping first
     * would destroy the {@code ://}, {@code .}, {@code &} and {@code ;} that identify them, which
     * is what previously caused ordinary words containing "amp" or "quot" to be mangled. Entities
     * are replaced by a space so the words around them are not fused. Whitespace is otherwise left
     * as is; the tokenizers split on runs of whitespace.
     *
     * @param text the raw text; must not be {@code null}
     * @return the cleaned text
     */
    @Override
    public String TextCleaner(String text) {
        String cleaned = URL_PATTERN.matcher(text).replaceAll("");
        cleaned = MENTION_PATTERN.matcher(cleaned).replaceAll("");
        cleaned = HTML_ENTITY_PATTERN.matcher(cleaned).replaceAll(" ");
        cleaned = RETWEET_PATTERN.matcher(cleaned).replaceAll("");
        return PUNCTUATION_PATTERN.matcher(cleaned).replaceAll("");
    }

    /**
     * Splits {@code line} on whitespace and records every non-empty token in both
     * {@link #bagOfWords} (unique words) and {@link #wordsList} (all occurrences, for frequencies).
     *
     * @param line a cleaned line of text
     */
    @Override
    public void bagOfWordsCreator(String line) {
        for (String token : line.trim().split("\\s+")) {
            // A blank line splits into a single empty token; it is not a word.
            if (token.isEmpty()) {
                continue;
            }
            bagOfWords.add(token);
            wordsList.add(token);
        }
    }

    /**
     * Prunes the vocabulary: drops every word that occurs {@value #RARE_WORD_MAX_FREQUENCY} times
     * or fewer in {@link #wordsList}, every blank token, and every stop word.
     */
    public void wordRemover() {
        // Count each word once instead of rescanning the whole list per token.
        Map<String, Integer> occurrences = new HashMap<String, Integer>();
        for (String word : wordsList) {
            Integer seen = occurrences.get(word);
            occurrences.put(word, seen == null ? 1 : seen + 1);
        }
        for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
            String word = entry.getKey();
            if (entry.getValue() <= RARE_WORD_MAX_FREQUENCY || word.trim().isEmpty()) {
                bagOfWords.remove(word);
            }
        }
        // Hash lookups keep this linear whichever side removeAll decides to iterate over.
        bagOfWords.removeAll(new HashSet<String>(stopWords));
    }

    /**
     * Appends every line of the file at {@code path} to {@code ar}. The file is expected to hold
     * one stop word (a word such as "a" or "the") per line.
     *
     * @param ar list that receives the lines
     * @param path path of the stop-word file
     * @throws IOException if the file cannot be opened or read
     */
    static void stopWordsReader(ArrayList<String> ar, String path) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            for (String word; (word = reader.readLine()) != null;) {
                ar.add(word);
            }
        }
    }

    /**
     * Encodes {@code line} as a binary vector with one entry per vocabulary word: 1 if the word
     * occurs in the line, 0 otherwise. A word's position is its position in the iteration order of
     * {@link #bagOfWords}, so vectors are comparable only while the vocabulary is unchanged.
     *
     * @param line a cleaned line of text
     * @return a new list of length {@code bagOfWords.size()} containing only 0s and 1s
     */
    @Override
    public ArrayList<Integer> wordVectorizer(String line) {
        ArrayList<Integer> vector = new ArrayList<Integer>(Collections.nCopies(bagOfWords.size(), 0));
        // Build the word -> position index once per call rather than copying the set per token.
        Map<String, Integer> positions = new HashMap<String, Integer>();
        int next = 0;
        for (String word : bagOfWords) {
            positions.put(word, next++);
        }
        for (String token : line.split("\\s+")) {
            Integer position = positions.get(token);
            if (position != null) {
                vector.set(position, 1);
            }
        }
        return vector;
    }
}