// (C) Copyright 2017, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Generating the training data: // If the format of the lstmf (ImageData) file changes, the training data will // have to be regenerated as follows: // // Use --xsize 800 for text2image to be similar to original training data. // // tesstrain.py --fonts_dir /usr/share/fonts --lang eng \ // --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \ // --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \ // --fontlist "Arial" --maxpages 10 // #include "lstm_test.h" namespace tesseract { // Tests that some simple networks can learn Arial and meet accuracy targets. TEST_F(LSTMTrainerTest, BasicTest) { // A Convolver sliding window classifier without LSTM. SetupTrainer( "[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 " "Ct1,1,64O1c1]", "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false, 2e-4, false, "eng"); double non_lstm_err = TrainIterations(kTrainerIterations * 4); EXPECT_LT(non_lstm_err, 98); LOG(INFO) << "********** Expected < 98 ************\n"; // A basic single-layer, single direction LSTM. SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false); double lstm_uni_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_uni_err, 86); LOG(INFO) << "********** Expected < 86 ************\n"; // Beats the convolver. (Although it does have a lot more weights, it still // iterates faster.) EXPECT_LT(lstm_uni_err, non_lstm_err); } // Color learns almost as fast as normalized grey/2D. TEST_F(LSTMTrainerTest, ColorTest) { // A basic single-layer, single direction LSTM. SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2D-color-lstm", true, true); double lstm_uni_err = TrainIterations(kTrainerIterations); EXPECT_LT(lstm_uni_err, 85); // EXPECT_GT(lstm_uni_err, 66); LOG(INFO) << "********** Expected < 85 ************\n"; } TEST_F(LSTMTrainerTest, BidiTest) { // A basic single-layer, bi-di 1d LSTM. SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false); double lstm_bi_err = TrainIterations(kTrainerIterations); EXPECT_LT(lstm_bi_err, 75); LOG(INFO) << "********** Expected < 75 ************\n"; // Int mode training is dead, so convert the trained network to int and check // that its error rate is close to the float version. TestIntMode(kTrainerIterations); } // Tests that a 2d-2-layer network learns correctly. // It takes a lot of iterations to get there. TEST_F(LSTMTrainerTest, Test2D) { // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, false); double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); EXPECT_LT(lstm_2d_err, 98); // EXPECT_GT(lstm_2d_err, 90); LOG(INFO) << "********** Expected < 98 ************\n"; // Int mode training is dead, so convert the trained network to int and check // that its error rate is close to the float version. TestIntMode(kTrainerIterations); } // Tests that a 2d-2-layer network with Adam does *a lot* better than // without it. TEST_F(LSTMTrainerTest, TestAdam) { // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, true); double lstm_2d_err = TrainIterations(kTrainerIterations); EXPECT_LT(lstm_2d_err, 70); LOG(INFO) << "********** Expected < 70 ************\n"; TestIntMode(kTrainerIterations); } // Trivial test of training speed on a fairly complex network. TEST_F(LSTMTrainerTest, SpeedTest) { SetupTrainerEng( "[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 " "O1c1]", "2-D-2-layer-lstm", false, true); TrainIterations(kTrainerIterations); LOG(INFO) << "********** *** ************\n"; } // Tests that two identical networks trained the same get the same results. // Also tests that the same happens with a serialize/deserialize in the middle. TEST_F(LSTMTrainerTest, DeterminismTest) { SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, false); double lstm_2d_err_a = TrainIterations(kTrainerIterations); double act_error_a = trainer_->ActivationError(); double char_error_a = trainer_->CharError(); std::vector trainer_a_data; EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, *trainer_, &trainer_a_data)); SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, false); double lstm_2d_err_b = TrainIterations(kTrainerIterations); double act_error_b = trainer_->ActivationError(); double char_error_b = trainer_->CharError(); EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); EXPECT_FLOAT_EQ(act_error_a, act_error_b); EXPECT_FLOAT_EQ(char_error_a, char_error_b); // Now train some more iterations. lstm_2d_err_b = TrainIterations(kTrainerIterations / 3); act_error_b = trainer_->ActivationError(); char_error_b = trainer_->CharError(); // Unpack into a new trainer and train that some more too. SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, false); EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, *trainer_)); lstm_2d_err_a = TrainIterations(kTrainerIterations / 3); act_error_a = trainer_->ActivationError(); char_error_a = trainer_->CharError(); EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); EXPECT_FLOAT_EQ(act_error_a, act_error_b); EXPECT_FLOAT_EQ(char_error_a, char_error_b); LOG(INFO) << "********** *** ************\n"; } // The baseline network against which to test the built-in softmax. TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) { // A basic single-layer, single direction LSTM. SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true); double lstm_uni_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_uni_err, 60); // EXPECT_GT(lstm_uni_err, 48); LOG(INFO) << "********** Expected < 60 ************\n"; // Check that it works in int mode too. TestIntMode(kTrainerIterations); // If we run TestIntMode again, it tests that int_mode networks can // serialize and deserialize correctly. double delta = TestIntMode(kTrainerIterations); // The two tests (both of int mode this time) should be almost identical. LOG(INFO) << "Delta in Int mode error rates = " << delta << "\n"; EXPECT_LT(delta, 0.01); } // Tests that the built-in softmax does better than the external one, // which has an error rate slightly less than 55%, as tested by // SoftmaxBaselineTest. TEST_F(LSTMTrainerTest, SoftmaxTest) { // LSTM with a built-in softmax can beat the external softmax. SetupTrainerEng("[1,1,0,32 LS96]", "Lstm-+-softmax", false, true); double lstm_sm_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_sm_err, 49.0); LOG(INFO) << "********** Expected < 49 ************\n"; // Check that it works in int mode too. TestIntMode(kTrainerIterations); } // Tests that the built-in encoded softmax does better than the external one. // It takes a lot of iterations to get there. TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) { // LSTM with a built-in encoded softmax can beat the external softmax. SetupTrainerEng("[1,1,0,32 LE96]", "Lstm-+-softmax", false, true); double lstm_sm_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_sm_err, 62.0); LOG(INFO) << "********** Expected < 62 ************\n"; // Check that it works in int mode too. TestIntMode(kTrainerIterations); } // Tests that layer access methods work correctly. TEST_F(LSTMTrainerTest, TestLayerAccess) { // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom. SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", false, false); // Number of layers. const size_t kNumLayers = 8; // Expected layer names. const char *kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", ":3:0", ":4:0", ":4:1:0", ":5"}; const char *kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL", "Maxpool", "Lfys32", "Lbx128LTR", "Lbx128", "Output"}; // Expected number of weights. const int kNumWeights[kNumLayers] = {0, 0, 16 * (25 + 1), 0, 32 * (4 * (32 + 16 + 1)), 128 * (4 * (128 + 32 + 1)), 128 * (4 * (128 + 32 + 1)), 112 * (2 * 128 + 1)}; auto layers = trainer_->EnumerateLayers(); EXPECT_EQ(kNumLayers, layers.size()); for (unsigned i = 0; i < kNumLayers && i < layers.size(); ++i) { EXPECT_STREQ(kLayerIds[i], layers[i].c_str()); EXPECT_STREQ(kLayerNames[i], trainer_->GetLayer(layers[i])->name().c_str()); EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights()); } } } // namespace tesseract.