forked from weinman/cnn_lstm_ctc_ocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mjsynth-tfrecord.py
202 lines (163 loc) · 7.55 KB
/
mjsynth-tfrecord.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# CNN-LSTM-CTC-OCR
# Copyright (C) 2017 Jerod Weinman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import tensorflow as tf
import math
import charset
"""Each record within the TFRecord file is a serialized Example proto.
The Example proto contains the following fields:
image/encoded: string containing JPEG encoded grayscale image
image/height: integer, image height in pixels
image/width: integer, image width in pixels
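image/filename: string containing the path of the image file, relative to the image base directory
image/labels: list of integers containing the sequence labels for the image text
text/string: string specifying the human-readable version of the text
text/length: integer, number of characters in the text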
"""
# Graph nodes shared by every call to get_image: the raw bytes of a JPEG file
# are fed into jpeg_data and jpeg_decoder is evaluated to obtain the pixels.
jpeg_data = tf.placeholder( dtype=tf.string )
# channels=1 asks the decoder for a single-channel (grayscale) image
jpeg_decoder = tf.image.decode_jpeg( jpeg_data,channels=1 )
# CNN kernels for image reduction; calc_seq_len reads kernel_sizes[0] to
# work out how much width the first convolution removes
kernel_sizes = [3,3,3,3,3,3]
# Minimum allowable width of image after CNN processing
# NOTE(review): is_writable compares the raw image width against this value
# (strictly greater than), not a post-CNN width -- confirm which is intended
min_width = 20
def calc_seq_len( image_width ):
    """Calculate sequence length of given image after CNN processing.

    Args:
      image_width: integer, width of the input image in pixels
    Returns:
      Integer sequence length. The value is not clamped, so it is zero or
      negative for very narrow images.
    """
    # The first convolution removes kernel_sizes[0] // 2 columns per side
    conv1_trim = 2 * (kernel_sizes[0] // 2)
    after_conv1 = image_width - conv1_trim
    after_pool2 = after_conv1 // 2  # width is halved, rounding down
    after_pool4 = after_pool2 - 1   # max without stride
    after_pool6 = after_pool4 - 1
    # The last stage (pool8) leaves the width unchanged, so the width after
    # pool6 is the sequence length. A trim derived from kernel_sizes[5] used
    # to be computed here but was never applied, so it has been dropped.
    return after_pool6
# Lookup table of calc_seq_len results indexed by image width, covering
# widths 0 through 1023; is_writable indexes it with the decoded image width
seq_lens = [calc_seq_len( w ) for w in range( 1024 )]
def gen_data( input_base_dir, image_list_filename, output_filebase,
              num_shards=1000, start_shard=0 ):
    """Generate several shards worth of TFRecord data.

    Shard i holds entries [i*images_per_shard, (i+1)*images_per_shard) of
    the image list and is written to '<output_filebase>-<i>.tfrecord', with
    i zero padded to the width of the largest shard index. A shard whose
    output file already exists is skipped, so an interrupted run can be
    restarted.

    Args:
      input_base_dir: directory holding the image list file and the images
      image_list_filename: name of the image list file in input_base_dir
      output_filebase: path prefix of the shard files to write
      num_shards: number of shards to split the image list across (>= 1)
      start_shard: index of the first shard to generate
    Raises:
      ValueError: if num_shards is less than 1
    """
    if num_shards < 1:
        raise ValueError( 'num_shards must be at least 1' )

    image_filenames = get_image_filenames(
        os.path.join( input_base_dir, image_list_filename ) )

    # Use appropriate # leading zeros. Counting the digits of the largest
    # shard index avoids log10(0) when there is a single shard and the
    # one-digit shortfall when num_shards-1 is an exact power of ten.
    num_digits = len( str( num_shards - 1 ) )
    shard_format = '%0' + ('%d' % num_digits) + 'd'

    # Ceiling division in integer arithmetic. Because the shard size is
    # rounded up, shards 0..num_shards-1 cover the whole list; no trailing
    # "remainder" shard is needed (it would always be an empty file).
    images_per_shard = ( len( image_filenames ) + num_shards - 1 ) // num_shards

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    sess = tf.Session( config=session_config )
    try:
        for i in range( start_shard, num_shards ):
            start = i * images_per_shard
            end = (i + 1) * images_per_shard
            out_filename = ( output_filebase + '-' + (shard_format % i) +
                             '.tfrecord' )
            if os.path.isfile( out_filename ):
                # Don't recreate data if restarting
                continue
            # Single-argument print behaves the same on Python 2 and 3
            print( '%d of %d [ %d : %d ] %s' %
                   (i, num_shards, start, end, out_filename) )
            gen_shard( sess, input_base_dir,
                       image_filenames[start:end], out_filename )
    finally:
        # Release the session even if a shard fails part way through
        sess.close()
def gen_shard( sess, input_base_dir, image_filenames, output_filename ):
    """Create a TFRecord file from a list of image filenames.

    An image is left out of the shard, with a note on stdout, when its file
    is empty ('SKIPPING'), when is_writable rejects it ('SKIPPING'), or when
    loading or encoding it raises ('ERROR').

    Args:
      sess: TensorFlow session used to decode the images
      input_base_dir: directory the image filenames are relative to
      image_filenames: list of image paths relative to input_base_dir
      output_filename: path of the TFRecord file to write
    """
    writer = tf.python_io.TFRecordWriter( output_filename )
    try:
        for filename in image_filenames:
            path_filename = os.path.join( input_base_dir, filename )
            if os.stat( path_filename ).st_size == 0:
                print( 'SKIPPING ' + filename )
                continue
            try:
                image_data, height, width = get_image( sess, path_filename )
                text, labels = get_text_and_labels( filename )
                if is_writable( width, text ):
                    example = make_example( filename, image_data,
                                            labels, text,
                                            height, width )
                    writer.write( example.SerializeToString() )
                else:
                    print( 'SKIPPING ' + filename )
            except Exception as err:
                # Some files have bogus payloads,
                # catch and note the error, moving on.
                # Exception rather than a bare except, so that Ctrl-C and
                # SystemExit still stop a long run.
                print( 'ERROR %s: %s' % (filename, err) )
    finally:
        # Always close the writer, even if the loop is interrupted
        writer.close()
def get_image_filenames( image_list_filename ):
    """Given input file, generate a list of relative filenames.

    Each non-blank line is expected to hold an image path, optionally
    followed by a space and a number, e.g.
        ./2697/6/466_MONIKER_49537.jpg 49537
    The path is returned without the leading "./" and without the number.

    Args:
      image_list_filename: path of the text file listing the images
    Returns:
      List of image paths in file order; blank lines are ignored
    """
    filenames = []
    with open( image_list_filename ) as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Blank lines would otherwise yield empty filenames
                continue
            # Keep only the path, dropping the trailing number if present
            filename = entry.split( ' ', 1 )[0]
            # Strip the "./" prefix only when it is really there, rather
            # than chopping two characters off every path
            if filename.startswith( './' ):
                filename = filename[2:]
            filenames.append( filename )
    return filenames
def get_image( sess, filename ):
    """Read an image file and report its encoded bytes and pixel size.

    Args:
      sess: TensorFlow session in which jpeg_decoder is evaluated
      filename: path to the image file
    Returns:
      Tuple (image_data, height, width): the file contents exactly as read,
      followed by the first two dimensions of the decoded image
    """
    with tf.gfile.GFile( filename, 'rb' ) as image_file:
        encoded = image_file.read()
    # Decode only to learn the size; the encoded bytes are what is returned
    decoded = sess.run( jpeg_decoder, feed_dict={ jpeg_data: encoded } )
    shape = decoded.shape
    return encoded, shape[0], shape[1]
def is_writable( image_width, text ):
    """Determine whether the CNN-processed image is longer than the string.

    Args:
      image_width: integer, width of the image in pixels
      text: string, ground truth text of the image
    Returns:
      True when image_width is greater than min_width and text has no more
      characters than the sequence length for that width; False otherwise
    """
    if image_width <= min_width:
        return False
    # seq_lens only tabulates a fixed range of widths. Compute the length
    # directly for wider images instead of raising IndexError, which callers
    # would report as a corrupt file.
    if image_width < len( seq_lens ):
        seq_len = seq_lens[image_width]
    else:
        seq_len = calc_seq_len( image_width )
    return len( text ) <= seq_len
def get_text_and_labels( filename ):
    """
    Extract the human-readable text and label sequence from image filename

    Args:
      filename: string, image path such as 2697/6/466_MONIKER_49537.jpg
    Returns:
      Tuple (text, labels): the text found between the first two underscores
      of the base filename, and the result of charset.string_to_label on it
    """
    # The ground truth sits between underscores in the base filename:
    #    2697/6/466_MONIKER_49537.jpg --> MONIKER
    base = os.path.basename( filename )
    fields = base.split( '_', 2 )
    text = fields[1]

    # charset maps the string to a sequence of indices, e.g.,
    #    MONIKER -> [12, 14, 13, 8, 10, 4, 17]
    # (the actual indices depend on the charset module)
    labels = charset.string_to_label( text )

    return text, labels
def make_example( filename, image_data, labels, text, height, width ):
    """Build an Example proto for an example.

    Args:
      filename: string, path to an image file, e.g., '/path/to/example.JPG'
      image_data: string, JPEG encoding of grayscale image
      labels: integer list, identifiers for the ground truth for the network
      text: string, unique human-readable, e.g. 'dog'
      height: integer, image height in pixels
      width: integer, image width in pixels
    Returns:
      Example proto with the features image/encoded, image/labels,
      image/height, image/width, image/filename, text/string and text/length
    """
    # String-valued fields are converted to bytes before being wrapped
    feature = {
        'image/encoded' : _bytes_feature( tf.compat.as_bytes( image_data ) ),
        'image/labels'  : _int64_feature( labels ),
        'image/height'  : _int64_feature( [height] ),
        'image/width'   : _int64_feature( [width] ),
        'image/filename': _bytes_feature( tf.compat.as_bytes( filename ) ),
        'text/string'   : _bytes_feature( tf.compat.as_bytes( text ) ),
        'text/length'   : _int64_feature( [len( text )] ),
    }
    features = tf.train.Features( feature=feature )
    return tf.train.Example( features=features )
def _int64_feature( values ):
    """Wrap a list of integers in a tf.train.Feature holding an Int64List."""
    int64_list = tf.train.Int64List( value=values )
    return tf.train.Feature( int64_list=int64_list )
def _bytes_feature( values ):
    """Wrap one bytes value in a tf.train.Feature holding a BytesList.

    Despite the plural name, values is a single item; it is placed in a
    one-element list here.
    """
    bytes_list = tf.train.BytesList( value=[values] )
    return tf.train.Feature( bytes_list=bytes_list )
def main( argv=None ):
    """Generate the train, validation and test TFRecord shards.

    Each annotation list under ../data/images is converted with the default
    gen_data settings. argv is accepted but not used.
    """
    image_dir = '../data/images'
    # (annotation list, output file prefix), processed in this order
    splits = [ ( 'annotation_train.txt', '../data/train/words' ),
               ( 'annotation_val.txt', '../data/val/words' ),
               ( 'annotation_test.txt', '../data/test/words' ) ]
    for image_list, output_filebase in splits:
        gen_data( image_dir, image_list, output_filebase )


if __name__ == '__main__':
    main()