-
Notifications
You must be signed in to change notification settings - Fork 97
/
data_utils.py
228 lines (180 loc) · 6.94 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
一些数据操作所需的模块
"""
import random
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib
VOCAB_SIZE_THRESHOLD_CPU = 50000
def _get_available_gpus():
"""获取当前可用GPU数量"""
local_device_protos = device_lib.list_local_devices()
return [x.name for x in local_device_protos if x.device_type == 'GPU']
def _get_embed_device(vocab_size):
"""Decide on which device to place an embed matrix given its vocab size.
根据输入输出的字典大小,选择在CPU还是GPU上初始化embedding向量
"""
gpus = _get_available_gpus()
if not gpus or vocab_size > VOCAB_SIZE_THRESHOLD_CPU:
return "/cpu:0"
return "/gpu:0"
def transform_sentence(q, ws_q, q_max):
x = ws_q.transform(q, max_len=q_max)
xl = len(q)
return x, xl
def transform_data(q, a, ws_q, ws_a, q_max, a_max):
"""转换数据
"""
x, xl = transform_sentence(q, ws_q, q_max)
y, yl = transform_sentence(a, ws_a, a_max)
return x, xl, y, yl
def batch_flow(x_data, y_data, ws_q, ws_a, batch_size):
"""从数据中随机 batch_size 个的数据,然后 yield 出去
"""
all_data = list(zip(x_data, y_data))
while True:
data_batch = random.sample(all_data, batch_size)
q_max = max([len(x[0]) for x in data_batch])
a_max = max([len(x[1]) for x in data_batch])
data_batch = sorted(data_batch, key=lambda x: len(x[1]), reverse=True)
x_batch = []
y_batch = []
xlen_batch = []
ylen_batch = []
for q, a in data_batch:
x, xl, y, yl = transform_data(
q, a, ws_q, ws_a, q_max, a_max
)
x_batch.append(x)
xlen_batch.append(xl)
y_batch.append(y)
ylen_batch.append(yl)
yield (
np.array(x_batch),
np.array(xlen_batch),
np.array(y_batch),
np.array(ylen_batch)
)
def batch_flow_bucket(x_data, y_data, ws_q, ws_a, batch_size, n_bucket=4):
"""从数据中随机 batch_size 个的数据,然后 yield 出去
一个 trick
相当于把不同数据的根据 target 句子的长度分组,算是一种 bucket
这里弄的比较简单,复杂一点的是把“相近长度”的输出聚合到一起
例如输出句子长度1~3的一组,4~6的一组
每个batch不会出现不同组的长度
"""
sizes = sorted(list(set([len(y) for y in y_data])))
if n_bucket > len(sizes):
n_bucket = len(sizes)
buckets = (np.linspace(0, 1, n_bucket, endpoint=False) * len(sizes)).astype(int)
buckets = [sizes[i] for i in buckets]
print('buckets', buckets)
sizes_data = {}
for i, k in enumerate(buckets):
if i > 0:
low = buckets[i - 1]
v = [(x, y) for x, y in zip(x_data, y_data)
if len(y) > low and len(y) <= k]
if len(v) >= batch_size:
sizes_data[k] = v
# while len(sizes_data[k]) < batch_size:
# sizes_data[k] = sizes_data[k] + sizes_data[k]
sizes = sorted(list(sizes_data.keys()))
print('sizes', buckets)
assert tuple(buckets[1:]) == tuple(sizes), \
'{} != {}'.format(buckets, sizes)
assert len(sizes) == n_bucket
while True:
size = random.choice(sizes)
data_batch = random.sample(sizes_data[size], batch_size)
q_max = max([len(x[0]) for x in data_batch])
a_max = max([len(x[1]) for x in data_batch])
data_batch = sorted(data_batch, key=lambda x: len(x[1]), reverse=True)
x_batch = []
y_batch = []
xlen_batch = []
ylen_batch = []
for q, a in data_batch:
x, xl, y, yl = transform_data(
q, a, ws_q, ws_a, q_max, a_max
)
x_batch.append(x)
xlen_batch.append(xl)
y_batch.append(y)
ylen_batch.append(yl)
yield (
np.array(x_batch),
np.array(xlen_batch),
np.array(y_batch),
np.array(ylen_batch)
)
def batch_flow_bucket_rl(p1_data, q1_data, p2_data, ws, batch_size, n_bucket=4):
"""从数据中随机 batch_size 个的数据,然后 yield 出去
一个 trick
相当于把不同数据的根据 target 句子的长度分组,算是一种 bucket
这里弄的比较简单,复杂一点的是把“相近长度”的输出聚合到一起
例如输出句子长度1~3的一组,4~6的一组
每个batch不会出现不同组的长度
"""
from data_utils import transform_sentence
from word_sequence import WordSequence
sizes = sorted(list(set([len(p2) for p2 in p2_data])))
buckets = (np.linspace(0, 1, n_bucket + 1) * len(sizes)).astype(int)
print('buckets', buckets)
sizes_data = {}
for i, k in enumerate(buckets):
if i > 0:
low = buckets[i - 1]
v = [(p1, q1, p2) for p1, q1, p2 in zip(p1_data, q1_data, p2_data)
if len(p2) > low and len(p2) < k]
sizes_data[k] = v
while len(sizes_data[k]) < batch_size:
sizes_data[k] = sizes_data[k] + sizes_data[k]
sizes = sorted(list(sizes_data.keys()))
# print('sizes', buckets)
assert tuple(buckets[1:]) == tuple(sizes), \
'{} != {}'.format(buckets, sizes)
assert len(sizes) == n_bucket
while True:
size = random.choice(sizes)
data_batch = random.sample(sizes_data[size], batch_size)
p1_max = max([len(x[0]) for x in data_batch])
q1_max = max([len(x[1]) for x in data_batch])
p1q1_max = max([len(x[0]) + len(x[1]) + 1 for x in data_batch])
p2_max = max([len(x[2]) for x in data_batch])
data_batch = sorted(data_batch, key=lambda x: len(x[2]), reverse=True)
p1_batch = []
q1_batch = []
p1q1_batch = []
p2_batch = []
p1_len_batch = []
q1_len_batch = []
p1q1_len_batch = []
p2_len_batch = []
for p1, q1, p2 in data_batch:
if p1[-1] in ('。', ',', '?', '!', '。', '…', '.', ',', '?', '!', '~'):
p1q1 = p1 + q1
else:
p1q1 = p1 + [','] + q1
x, xl = transform_sentence(p1, ws, p1_max)
p1_batch.append(x)
p1_len_batch.append(xl)
x, xl = transform_sentence(q1, ws, q1_max)
q1_batch.append(x)
q1_len_batch.append(xl)
x, xl = transform_sentence(p1q1, ws, p1q1_max)
p1q1_batch.append(x)
p1q1_len_batch.append(xl)
x, xl = transform_sentence(p2, ws, p2_max)
p2_batch.append(x)
p2_len_batch.append(xl)
yield (
np.array(p1_batch),
np.array(p1_len_batch),
np.array(q1_batch),
np.array(q1_len_batch),
np.array(p1q1_batch),
np.array(p1q1_len_batch),
np.array(p2_batch),
np.array(p2_len_batch),
)