forked from tesseract-ocr/tesseract
-
Notifications
You must be signed in to change notification settings - Fork 1
/
lstm_test.cc
214 lines (197 loc) · 9.51 KB
/
lstm_test.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Generating the training data:
// If the format of the lstmf (ImageData) file changes, the training data will
// have to be regenerated as follows:
//
// Use --xsize 800 for text2image to be similar to original training data.
//
// tesstrain.py --fonts_dir /usr/share/fonts --lang eng \
// --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \
// --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \
// --fontlist "Arial" --maxpages 10
//
#include "lstm_test.h"
namespace tesseract {
// Trains two simple networks on Arial data and checks that each one reaches
// its error target, and that the LSTM finishes with a lower error than the
// LSTM-free convolver.
TEST_F(LSTMTrainerTest, BasicTest) {
  // Baseline: a convolutional sliding-window classifier with no LSTM layer.
  const char *const conv_spec =
      "[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 "
      "Ct1,1,64O1c1]";
  SetupTrainer(conv_spec, "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false,
               2e-4, false, "eng");
  const double conv_error = TrainIterations(kTrainerIterations * 4);
  EXPECT_LT(conv_error, 98);
  LOG(INFO) << "********** Expected < 98 ************\n";

  // Contender: a single-layer, single-direction LSTM, trained for half as
  // many iterations as the convolver above.
  SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false);
  const double lstm_error = TrainIterations(kTrainerIterations * 2);
  EXPECT_LT(lstm_error, 86);
  LOG(INFO) << "********** Expected < 86 ************\n";

  // The LSTM has to beat the convolver. (It carries far more weights, yet it
  // still iterates faster.)
  EXPECT_LT(lstm_error, conv_error);
}
// Checks that training on color input converges almost as quickly as the
// normalized grey 2-D case.
TEST_F(LSTMTrainerTest, ColorTest) {
  // Same layer stack as the spec used by Test2D, except that the last field
  // of the input descriptor is 3 instead of 1 (presumably the number of
  // color channels -- confirm against the network-spec documentation).
  const char *const color_spec = "[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]";
  SetupTrainerEng(color_spec, "2D-color-lstm", true, true);
  const double color_error = TrainIterations(kTrainerIterations);
  EXPECT_LT(color_error, 85);
  // A lower bound (error > 66) used to be checked here as well; it is
  // currently disabled.
  LOG(INFO) << "********** Expected < 85 ************\n";
}
// Trains a single-layer bidirectional 1-D LSTM, checks its error target and
// then runs the int-mode conversion check on the result.
TEST_F(LSTMTrainerTest, BidiTest) {
  SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false);
  const double bidi_error = TrainIterations(kTrainerIterations);
  EXPECT_LT(bidi_error, 75);
  LOG(INFO) << "********** Expected < 75 ************\n";
  // Training in int mode is dead, so the trained network is converted to int
  // instead and its error rate is checked to be close to the float version.
  TestIntMode(kTrainerIterations);
}
// Checks that a 2-D, 2-layer network learns correctly. Getting there takes a
// lot of iterations, hence the 3/2 multiplier on the iteration count.
TEST_F(LSTMTrainerTest, Test2D) {
  // Two LSTM layers, the bottom one being a 2-D feature-extracting LSTM.
  const char *const spec_2d = "[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]";
  SetupTrainerEng(spec_2d, "2-D-2-layer-lstm", false, false);
  const double error_2d = TrainIterations(kTrainerIterations * 3 / 2);
  EXPECT_LT(error_2d, 98);
  // A lower bound (error > 90) used to be checked here as well; it is
  // currently disabled.
  LOG(INFO) << "********** Expected < 98 ************\n";
  // Training in int mode is dead, so the trained network is converted to int
  // instead and its error rate is checked to be close to the float version.
  TestIntMode(kTrainerIterations);
}
// Checks that the 2-D, 2-layer network does *a lot* better with Adam than
// without it: the error target here is 70, against 98 for the same spec in
// Test2D.
TEST_F(LSTMTrainerTest, TestAdam) {
  // Two LSTM layers, the bottom one being a 2-D feature-extracting LSTM.
  // The only setup difference from Test2D is the final flag, which is true.
  const char *const spec_2d = "[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]";
  SetupTrainerEng(spec_2d, "2-D-2-layer-lstm", false, true);
  const double adam_error = TrainIterations(kTrainerIterations);
  EXPECT_LT(adam_error, 70);
  LOG(INFO) << "********** Expected < 70 ************\n";
  // Run the int-mode check on the trained network as well.
  TestIntMode(kTrainerIterations);
}
// Trivial training-speed exercise on a fairly complex network. No error or
// timing threshold is asserted; the test only runs the training iterations.
TEST_F(LSTMTrainerTest, SpeedTest) {
  const char *const complex_spec =
      "[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 "
      "O1c1]";
  SetupTrainerEng(complex_spec, "2-D-2-layer-lstm", false, true);
  // The returned error is deliberately ignored.
  TrainIterations(kTrainerIterations);
  LOG(INFO) << "********** *** ************\n";
}
// Tests that two identical networks trained the same way get the same
// results, and that this still holds when one of them goes through a
// serialize/deserialize round trip in the middle of training.
TEST_F(LSTMTrainerTest, DeterminismTest) {
  // Every trainer in this test must be built from exactly the same spec and
  // model name, so both are spelled out once.
  const char *const kNetworkSpec = "[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]";
  const char *const kModelName = "2-D-2-layer-lstm";

  // Run A: train, record the errors and keep a dump of the trainer state.
  SetupTrainerEng(kNetworkSpec, kModelName, false, false);
  double lstm_2d_err_a = TrainIterations(kTrainerIterations);
  double act_error_a = trainer_->ActivationError();
  double char_error_a = trainer_->CharError();
  std::vector<char> trainer_a_data;
  // Fatal on failure: without a valid dump the second half of the test would
  // compare against an unrestored trainer and report misleading mismatches.
  ASSERT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, *trainer_, &trainer_a_data));

  // Run B: a fresh trainer with identical setup must reproduce run A.
  SetupTrainerEng(kNetworkSpec, kModelName, false, false);
  double lstm_2d_err_b = TrainIterations(kTrainerIterations);
  double act_error_b = trainer_->ActivationError();
  double char_error_b = trainer_->CharError();
  EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);
  EXPECT_FLOAT_EQ(act_error_a, act_error_b);
  EXPECT_FLOAT_EQ(char_error_a, char_error_b);

  // Now train run B for some more iterations.
  lstm_2d_err_b = TrainIterations(kTrainerIterations / 3);
  act_error_b = trainer_->ActivationError();
  char_error_b = trainer_->CharError();

  // Unpack the dump of run A into a new trainer and train that for the same
  // number of extra iterations; it must land on the same errors as run B.
  SetupTrainerEng(kNetworkSpec, kModelName, false, false);
  // Fatal on failure for the same reason as the save above.
  ASSERT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, *trainer_));
  lstm_2d_err_a = TrainIterations(kTrainerIterations / 3);
  act_error_a = trainer_->ActivationError();
  char_error_a = trainer_->CharError();
  EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);
  EXPECT_FLOAT_EQ(act_error_a, act_error_b);
  EXPECT_FLOAT_EQ(char_error_a, char_error_b);
  LOG(INFO) << "********** *** ************\n";
}
// Baseline for the built-in softmax tests: a plain LSTM followed by an
// external softmax output layer.
TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) {
  // A basic single-layer, single-direction LSTM.
  SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true);
  const double baseline_error = TrainIterations(kTrainerIterations * 2);
  EXPECT_LT(baseline_error, 60);
  // A lower bound (error > 48) used to be checked here as well; it is
  // currently disabled.
  LOG(INFO) << "********** Expected < 60 ************\n";

  // First pass: check that the network also works in int mode.
  TestIntMode(kTrainerIterations);
  // Second pass: running TestIntMode again tests that int-mode networks
  // serialize and deserialize correctly.
  const double int_mode_delta = TestIntMode(kTrainerIterations);
  // Both sides of this comparison are int mode, so the error rates should be
  // almost identical.
  LOG(INFO) << "Delta in Int mode error rates = " << int_mode_delta << "\n";
  EXPECT_LT(int_mode_delta, 0.01);
}
// Checks that an LSTM with a built-in softmax does better than the external
// softmax: it must get below 49, whereas SoftmaxBaselineTest only requires
// the external-softmax network to get below 60.
TEST_F(LSTMTrainerTest, SoftmaxTest) {
  const char *const softmax_spec = "[1,1,0,32 LS96]";
  SetupTrainerEng(softmax_spec, "Lstm-+-softmax", false, true);
  const double softmax_error = TrainIterations(kTrainerIterations * 2);
  EXPECT_LT(softmax_error, 49.0);
  LOG(INFO) << "********** Expected < 49 ************\n";
  // Check that the network works in int mode too.
  TestIntMode(kTrainerIterations);
}
// Checks the error target for an LSTM with a built-in encoded softmax, which
// is meant to do better than the external softmax. Getting there takes a lot
// of iterations.
TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) {
  const char *const encoded_spec = "[1,1,0,32 LE96]";
  SetupTrainerEng(encoded_spec, "Lstm-+-softmax", false, true);
  const double encoded_error = TrainIterations(kTrainerIterations * 2);
  EXPECT_LT(encoded_error, 62.0);
  LOG(INFO) << "********** Expected < 62 ************\n";
  // Check that the network works in int mode too.
  TestIntMode(kTrainerIterations);
}
// Tests that the layer access methods work correctly: EnumerateLayers() must
// list the expected layer ids in order, and GetLayer() must resolve each id
// to a layer with the expected name and weight count.
TEST_F(LSTMTrainerTest, TestLayerAccess) {
  // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom.
  SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", false, false);
  // Number of layers.
  const size_t kNumLayers = 8;
  // Expected layer ids, in enumeration order.
  const char *kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", ":3:0", ":4:0", ":4:1:0", ":5"};
  // Expected layer names, index-aligned with kLayerIds.
  const char *kLayerNames[kNumLayers] = {"Input",  "Convolve",  "ConvNL", "Maxpool",
                                         "Lfys32", "Lbx128LTR", "Lbx128", "Output"};
  // Expected number of weights, index-aligned with kLayerIds.
  const int kNumWeights[kNumLayers] = {0,                           // Input
                                       0,                           // Convolve
                                       16 * (25 + 1),               // ConvNL
                                       0,                           // Maxpool
                                       32 * (4 * (32 + 16 + 1)),    // Lfys32
                                       128 * (4 * (128 + 32 + 1)),  // Lbx128LTR
                                       128 * (4 * (128 + 32 + 1)),  // Lbx128
                                       112 * (2 * 128 + 1)};        // Output
  const auto layers = trainer_->EnumerateLayers();
  EXPECT_EQ(kNumLayers, layers.size());
  // Bounded by both sizes so that a wrong layer count (already reported by
  // the check above) cannot cause an out-of-range access on either side.
  for (size_t i = 0; i < kNumLayers && i < layers.size(); ++i) {
    EXPECT_STREQ(kLayerIds[i], layers[i].c_str());
    // Look the layer up once and stop with a readable failure if the lookup
    // yields null, instead of crashing the test binary on the dereference.
    auto *layer = trainer_->GetLayer(layers[i]);
    ASSERT_TRUE(layer != nullptr) << "GetLayer returned null for layer id " << layers[i];
    EXPECT_STREQ(kLayerNames[i], layer->name().c_str());
    EXPECT_EQ(kNumWeights[i], layer->num_weights());
  }
}
} // namespace tesseract.