mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
200 lines
8.5 KiB
C++
200 lines
8.5 KiB
C++
|
|
||
|
// Generating the training data:
// If the format of the lstmf (ImageData) file changes, the training data will
// have to be regenerated as follows:
// ./tesseract/text2image --xsize=800 --font=Arial \
//   --text=tesseract/testdata/lstm_training.txt --leading=32 \
//   --outputbase=tesseract/testdata/lstm_training.arial
// ./tesseract tesseract/testdata/lstm_training.arial.tif \
//   tesseract/testdata/lstm_training.arial lstm.train \
//   --pageseg_mode=6
|
||
|
|
||
|
#include "tesseract/unittest/lstm_test.h"
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
// Tests that some simple networks can learn Arial and meet accuracy targets.
|
||
|
TEST_F(LSTMTrainerTest, BasicTest) {
|
||
|
// A Convolver sliding window classifier without LSTM.
|
||
|
SetupTrainer(
|
||
|
"[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 "
|
||
|
"Ct1,1,64O1c1]",
|
||
|
"no-lstm", "eng.unicharset", "lstm_training.arial.lstmf", false, false,
|
||
|
2e-4, false);
|
||
|
double non_lstm_err = TrainIterations(kTrainerIterations * 3 / 2);
|
||
|
EXPECT_LT(non_lstm_err, 98);
|
||
|
|
||
|
// A basic single-layer, single direction LSTM.
|
||
|
SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false);
|
||
|
double lstm_uni_err = TrainIterations(kTrainerIterations * 2);
|
||
|
EXPECT_LT(lstm_uni_err, 86);
|
||
|
// Beats the convolver. (Although it does have a lot more weights, it still
|
||
|
// iterates faster.)
|
||
|
EXPECT_LT(lstm_uni_err, non_lstm_err);
|
||
|
}
|
||
|
|
||
|
// Color learns almost as fast as normalized grey/2D.
|
||
|
TEST_F(LSTMTrainerTest, ColorTest) {
|
||
|
// A basic single-layer, single direction LSTM.
|
||
|
SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||
|
"2D-color-lstm", true, true);
|
||
|
double lstm_uni_err = TrainIterations(kTrainerIterations);
|
||
|
EXPECT_LT(lstm_uni_err, 85);
|
||
|
EXPECT_GT(lstm_uni_err, 66);
|
||
|
}
|
||
|
|
||
|
// Trains a bi-directional 1-D LSTM, then checks its int-converted form.
TEST_F(LSTMTrainerTest, BidiTest) {
  // A basic single-layer, bi-di 1d LSTM.
  SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false);
  const double bidi_err = TrainIterations(kTrainerIterations);
  EXPECT_LT(bidi_err, 75);

  // Training directly in int mode is dead, so instead convert the trained
  // float network to int and check that its error rate stays close to the
  // float version.
  TestIntMode(kTrainerIterations);
}
|
||
|
|
||
|
// Tests that a 2d-2-layer network learns correctly.
|
||
|
// It takes a lot of iterations to get there.
|
||
|
TEST_F(LSTMTrainerTest, Test2D) {
|
||
|
// A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.
|
||
|
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||
|
"2-D-2-layer-lstm", false, false);
|
||
|
double lstm_2d_err = TrainIterations(kTrainerIterations);
|
||
|
EXPECT_LT(lstm_2d_err, 98);
|
||
|
EXPECT_GT(lstm_2d_err, 90);
|
||
|
// Int mode training is dead, so convert the trained network to int and check
|
||
|
// that its error rate is close to the float version.
|
||
|
TestIntMode(kTrainerIterations);
|
||
|
}
|
||
|
|
||
|
// Tests that a 2d-2-layer network with Adam does *a lot* better than
|
||
|
// without it.
|
||
|
TEST_F(LSTMTrainerTest, TestAdam) {
|
||
|
// A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.
|
||
|
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||
|
"2-D-2-layer-lstm", false, true);
|
||
|
double lstm_2d_err = TrainIterations(kTrainerIterations);
|
||
|
EXPECT_LT(lstm_2d_err, 70);
|
||
|
TestIntMode(kTrainerIterations);
|
||
|
}
|
||
|
|
||
|
// Trivial test of training speed on a fairly complex network.
|
||
|
TEST_F(LSTMTrainerTest, SpeedTest) {
|
||
|
SetupTrainerEng(
|
||
|
"[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 "
|
||
|
"O1c1]",
|
||
|
"2-D-2-layer-lstm", false, true);
|
||
|
TrainIterations(kTrainerIterations);
|
||
|
}
|
||
|
|
||
|
// Tests that two identical networks trained the same get the same results.
|
||
|
// Also tests that the same happens with a serialize/deserialize in the middle.
|
||
|
TEST_F(LSTMTrainerTest, DeterminismTest) {
|
||
|
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||
|
"2-D-2-layer-lstm", false, false);
|
||
|
double lstm_2d_err_a = TrainIterations(kTrainerIterations / 3);
|
||
|
double act_error_a = trainer_->ActivationError();
|
||
|
double char_error_a = trainer_->CharError();
|
||
|
GenericVector<char> trainer_a_data;
|
||
|
EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(),
|
||
|
&trainer_a_data));
|
||
|
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||
|
"2-D-2-layer-lstm", false, false);
|
||
|
double lstm_2d_err_b = TrainIterations(kTrainerIterations / 3);
|
||
|
double act_error_b = trainer_->ActivationError();
|
||
|
double char_error_b = trainer_->CharError();
|
||
|
EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);
|
||
|
EXPECT_FLOAT_EQ(act_error_a, act_error_b);
|
||
|
EXPECT_FLOAT_EQ(char_error_a, char_error_b);
|
||
|
// Now train some more iterations.
|
||
|
lstm_2d_err_b = TrainIterations(kTrainerIterations / 3);
|
||
|
act_error_b = trainer_->ActivationError();
|
||
|
char_error_b = trainer_->CharError();
|
||
|
// Unpack into a new trainer and train that some more too.
|
||
|
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||
|
"2-D-2-layer-lstm", false, false);
|
||
|
EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, trainer_.get()));
|
||
|
lstm_2d_err_a = TrainIterations(kTrainerIterations / 3);
|
||
|
act_error_a = trainer_->ActivationError();
|
||
|
char_error_a = trainer_->CharError();
|
||
|
EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);
|
||
|
EXPECT_FLOAT_EQ(act_error_a, act_error_b);
|
||
|
EXPECT_FLOAT_EQ(char_error_a, char_error_b);
|
||
|
}
|
||
|
|
||
|
// The baseline network against which to test the built-in softmax.
|
||
|
TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) {
|
||
|
// A basic single-layer, single direction LSTM.
|
||
|
SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true);
|
||
|
double lstm_uni_err = TrainIterations(kTrainerIterations * 2);
|
||
|
EXPECT_LT(lstm_uni_err, 60);
|
||
|
EXPECT_GT(lstm_uni_err, 48);
|
||
|
// Check that it works in int mode too.
|
||
|
TestIntMode(kTrainerIterations);
|
||
|
// If we run TestIntMode again, it tests that int_mode networks can
|
||
|
// serialize and deserialize correctly.
|
||
|
double delta = TestIntMode(kTrainerIterations);
|
||
|
// The two tests (both of int mode this time) should be almost identical.
|
||
|
LOG(INFO) << "Delta in Int mode error rates = " << delta;
|
||
|
EXPECT_LT(delta, 0.01);
|
||
|
}
|
||
|
|
||
|
// Tests that the built-in softmax does better than the external one,
|
||
|
// which has an error rate slightly less than 55%, as tested by
|
||
|
// SoftmaxBaselineTest.
|
||
|
TEST_F(LSTMTrainerTest, SoftmaxTest) {
|
||
|
// LSTM with a built-in softmax can beat the external softmax.
|
||
|
SetupTrainerEng("[1,1,0,32 LS96]", "Lstm-+-softmax", false, true);
|
||
|
double lstm_sm_err = TrainIterations(kTrainerIterations * 2);
|
||
|
EXPECT_LT(lstm_sm_err, 49.0);
|
||
|
// Check that it works in int mode too.
|
||
|
TestIntMode(kTrainerIterations);
|
||
|
}
|
||
|
|
||
|
// Tests that the built-in encoded softmax does better than the external one.
|
||
|
// It takes a lot of iterations to get there.
|
||
|
TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) {
|
||
|
// LSTM with a built-in encoded softmax can beat the external softmax.
|
||
|
SetupTrainerEng("[1,1,0,32 LE96]", "Lstm-+-softmax", false, true);
|
||
|
double lstm_sm_err = TrainIterations(kTrainerIterations * 2);
|
||
|
EXPECT_LT(lstm_sm_err, 62.0);
|
||
|
// Check that it works in int mode too.
|
||
|
TestIntMode(kTrainerIterations);
|
||
|
}
|
||
|
|
||
|
// Tests that layer access methods work correctly.
|
||
|
TEST_F(LSTMTrainerTest, TestLayerAccess) {
|
||
|
// A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom.
|
||
|
SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm",
|
||
|
false, false);
|
||
|
// Number of layers.
|
||
|
const int kNumLayers = 8;
|
||
|
// Expected layer names.
|
||
|
const char* kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2",
|
||
|
":3:0", ":4:0", ":4:1:0", ":5"};
|
||
|
const char* kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL",
|
||
|
"Maxpool", "Lfys32", "Lbx128LTR",
|
||
|
"Lbx128", "Output"};
|
||
|
// Expected number of weights.
|
||
|
const int kNumWeights[kNumLayers] = {0,
|
||
|
0,
|
||
|
16 * (25 + 1),
|
||
|
0,
|
||
|
32 * (4 * (32 + 16 + 1)),
|
||
|
128 * (4 * (128 + 32 + 1)),
|
||
|
128 * (4 * (128 + 32 + 1)),
|
||
|
112 * (2 * 128 + 1)};
|
||
|
|
||
|
GenericVector<STRING> layers = trainer_->EnumerateLayers();
|
||
|
EXPECT_EQ(kNumLayers, layers.size());
|
||
|
for (int i = 0; i < kNumLayers && i < layers.size(); ++i) {
|
||
|
EXPECT_STREQ(kLayerIds[i], layers[i].string());
|
||
|
EXPECT_STREQ(kLayerNames[i],
|
||
|
trainer_->GetLayer(layers[i])->name().string());
|
||
|
EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
} // namespace tesseract.
|