-
Notifications
You must be signed in to change notification settings - Fork 3
/
extract_datasets_for_torch1.py
58 lines (40 loc) · 1.34 KB
/
extract_datasets_for_torch1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import random
import numpy as np
from cs224d.data_utils import *
# Fix the RNG seed so any dataset sampling below is reproducible.
random.seed(1)

# StanfordSentiment comes from cs224d.data_utils (project-local).
# tokens() presumably returns a dict mapping word -> integer index — TODO confirm.
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# word_to_num is deliberately an alias of the tokens dict (original behavior
# preserved); num_to_word is its inverse, built in one pass instead of the
# original's redundant re-assignment of word_to_num while iterating it
# (a no-op that is fragile under Python 3's dict-mutation-during-iteration rules).
num_to_word = {index: word for (word, index) in tokens.items()}
word_to_num = tokens
def extract(trainset, fn_words, fn_labels):
    """Write a sentence dataset to two parallel text files.

    Each element of `trainset` is a (words, label) pair. For every pair this
    writes one line of space-separated word indices (looked up in the
    module-level `word_to_num` dict) to `fn_words`, and the 1-based label
    (label + 1, original on-disk convention preserved) to `fn_labels`.

    Fixes over the original: `xrange` (Python 2 only) replaced by direct
    tuple iteration; dead locals (`trainLabels`, `trainSentences`, `b`) and
    the trailing `pass` removed.
    """
    x_lines = []
    y_lines = []
    for words, label in trainset:
        indices = [word_to_num[word] for word in words]
        # Labels are shifted from 0-based to 1-based on disk.
        y_lines.append(str(label + 1) + '\n')
        x_lines.append(' '.join(str(k) for k in indices) + '\n')
    with open(fn_words, 'w') as f:
        f.writelines(x_lines)
    with open(fn_labels, 'w') as f:
        f.writelines(y_lines)
# Dump the vocabulary in both directions, indices shifted to 1-based
# (original on-disk format preserved):
#   inv_vocabulary_raw : "word index"
#   vocabulary_raw     : "index word"
# A single multi-manager `with` replaces the original's nested blocks.
with open('inv_vocabulary_raw', 'w') as f_inv, open('vocabulary_raw', 'w') as f_voc:
    inv_lines = []
    voc_lines = []
    for word, num in word_to_num.items():
        inv_lines.append(word + ' ' + str(num + 1) + '\n')
        voc_lines.append(str(num + 1) + ' ' + word + '\n')
    f_inv.writelines(inv_lines)
    f_voc.writelines(voc_lines)

# Emit index-encoded sentences and 1-based labels for the train and dev splits.
extract(dataset.getTrainSentences(), 'x_train', 'y_train')
extract(dataset.getDevSentences(), 'x_dev', 'y_dev')