tesseract/unittest/lang_model_test.cc

// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string> // for std::string

#include "gmock/gmock.h" // for testing::ElementsAreArray

#include "include_gunit.h"
#include "lang_model_helpers.h"
#include "log.h" // for LOG
#include "lstmtrainer.h"
#include "unicharset_training_utils.h"

namespace tesseract {

// Builds the path of the test data file `name` by joining it onto
// TESTING_DIR.
std::string TestDataNameToPath(const std::string &name) {
  std::string full_path = file::JoinPath(TESTING_DIR, name);
  return full_path;
}

// This is an integration test that verifies that CombineLangModel works to
// the extent that an LSTMTrainer can be initialized with the result, and it
// can encode strings. More importantly, the test verifies that adding an extra
// character to the unicharset does not change the encoding of strings.
//
// Outline:
//   1. Build "eng" traineddata from the unmodified unicharset and encode
//      kTestString with a trainer initialized from it.
//   2. Insert the Rupee sign into the unicharset, build "extended"
//      traineddata and encode the same string again.
//   3. Check that both encodings agree once the null label is renumbered, and
//      that only the extended model can encode the Rupee sign.
TEST(LangModelTest, AddACharacter) {
  constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
  constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
  // Setup the arguments.
  std::string script_dir = LANGDATA_DIR;
  std::string eng_dir = file::JoinPath(script_dir, "eng");
  std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
  UNICHARSET unicharset;
  EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
  std::string version_str = "TestVersion";
  file::MakeTmpdir();
  std::string output_dir = FLAGS_test_tmpdir;
  LOG(INFO) << "Output dir=" << output_dir << "\n";
  std::string lang1 = "eng";
  bool pass_through_recoder = false;
  // If these reads fail, we get a warning message and an empty list of words.
  // The sizes are compared against an unsigned literal to avoid a
  // signed/unsigned comparison inside the assertion.
  std::vector<std::string> words = split(ReadFile(file::JoinPath(eng_dir, "eng.wordlist")), '\n');
  EXPECT_GT(words.size(), 0u);
  std::vector<std::string> puncs = split(ReadFile(file::JoinPath(eng_dir, "eng.punc")), '\n');
  EXPECT_GT(puncs.size(), 0u);
  std::vector<std::string> numbers = split(ReadFile(file::JoinPath(eng_dir, "eng.numbers")), '\n');
  EXPECT_GT(numbers.size(), 0u);
  bool lang_is_rtl = false;
  // Generate the traineddata file.
  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,
                                           nullptr, nullptr));
  // Init a trainer with it, and encode kTestString.
  std::string traineddata1 = file::JoinPath(output_dir, lang1, lang1) + ".traineddata";
  LSTMTrainer trainer1;
  trainer1.InitCharSet(traineddata1);
  std::vector<int> labels1;
  EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
  std::string test1_decoded = trainer1.DecodeLabels(labels1);
  LOG(INFO) << "Labels1=" << test1_decoded << "\n";

  // Add a new character to the unicharset and try again.
  int size_before = unicharset.size();
  unicharset.unichar_insert("₹");
  SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);
  EXPECT_EQ(size_before + 1, unicharset.size());
  // Generate the traineddata file.
  std::string lang2 = "extended";
  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,
                                           nullptr, nullptr));
  // Init a trainer with it, and encode kTestString.
  std::string traineddata2 = file::JoinPath(output_dir, lang2, lang2) + ".traineddata";
  LSTMTrainer trainer2;
  trainer2.InitCharSet(traineddata2);
  std::vector<int> labels2;
  EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
  std::string test2_decoded = trainer2.DecodeLabels(labels2);
  LOG(INFO) << "Labels2=" << test2_decoded << "\n";
  // encode kTestStringRupees.
  std::vector<int> labels3;
  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
  std::string test3_decoded = trainer2.DecodeLabels(labels3);
  LOG(INFO) << "labels3=" << test3_decoded << "\n";
  // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
  // Since Tensor Flow's CTC implementation insists on having the null be the
  // last label, and we want to be compatible, null has to be renumbered when
  // we add a class.
  int null1 = trainer1.null_char();
  int null2 = trainer2.null_char();
  EXPECT_EQ(null1 + 1, null2);
  std::vector<int> labels1_v(labels1);
  for (int &label : labels1_v) {
    if (label == null1) {
      label = null2;
    }
  }
  // Hand the container itself to the matcher: indexing labels2[0] would be
  // undefined behaviour whenever labels2 is empty.
  EXPECT_THAT(labels1_v, testing::ElementsAreArray(labels2));
  // To make sure we are not cheating somehow, we can now encode the Rupee
  // symbol, which we could not do before.
  EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
}

// Same scenario as the AddACharacter test, for hin instead of eng: build
// traineddata from the "hin" unicharset, add the Rupee sign, rebuild as
// "extendedhin", and verify that the encoding of a Hindi test string is
// unchanged apart from the renumbered null label.
TEST(LangModelTest, AddACharacterHindi) {
  constexpr char kTestString[] = "हिन्दी में एक लाइन लिखें";
  constexpr char kTestStringRupees[] = "हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००";
  // Setup the arguments.
  std::string script_dir = LANGDATA_DIR;
  std::string hin_dir = file::JoinPath(script_dir, "hin");
  std::string unicharset_path = TestDataNameToPath("hin_beam.unicharset");
  UNICHARSET unicharset;
  EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
  std::string version_str = "TestVersion";
  file::MakeTmpdir();
  std::string output_dir = FLAGS_test_tmpdir;
  LOG(INFO) << "Output dir=" << output_dir << "\n";
  std::string lang1 = "hin";
  bool pass_through_recoder = false;
  // If these reads fail, we get a warning message and an empty list of words.
  // The sizes are compared against an unsigned literal to avoid a
  // signed/unsigned comparison inside the assertion.
  std::vector<std::string> words = split(ReadFile(file::JoinPath(hin_dir, "hin.wordlist")), '\n');
  EXPECT_GT(words.size(), 0u);
  std::vector<std::string> puncs = split(ReadFile(file::JoinPath(hin_dir, "hin.punc")), '\n');
  EXPECT_GT(puncs.size(), 0u);
  std::vector<std::string> numbers = split(ReadFile(file::JoinPath(hin_dir, "hin.numbers")), '\n');
  EXPECT_GT(numbers.size(), 0u);
  bool lang_is_rtl = false;
  // Generate the traineddata file.
  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,
                                           nullptr, nullptr));
  // Init a trainer with it, and encode kTestString.
  std::string traineddata1 = file::JoinPath(output_dir, lang1, lang1) + ".traineddata";
  LSTMTrainer trainer1;
  trainer1.InitCharSet(traineddata1);
  std::vector<int> labels1;
  EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
  std::string test1_decoded = trainer1.DecodeLabels(labels1);
  LOG(INFO) << "Labels1=" << test1_decoded << "\n";

  // Add a new character to the unicharset and try again.
  int size_before = unicharset.size();
  unicharset.unichar_insert("₹");
  SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);
  EXPECT_EQ(size_before + 1, unicharset.size());
  // Generate the traineddata file.
  std::string lang2 = "extendedhin";
  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,
                                           nullptr, nullptr));
  // Init a trainer with it, and encode kTestString.
  std::string traineddata2 = file::JoinPath(output_dir, lang2, lang2) + ".traineddata";
  LSTMTrainer trainer2;
  trainer2.InitCharSet(traineddata2);
  std::vector<int> labels2;
  EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
  std::string test2_decoded = trainer2.DecodeLabels(labels2);
  LOG(INFO) << "Labels2=" << test2_decoded << "\n";
  // encode kTestStringRupees.
  std::vector<int> labels3;
  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
  std::string test3_decoded = trainer2.DecodeLabels(labels3);
  LOG(INFO) << "labels3=" << test3_decoded << "\n";
  // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
  // Since Tensor Flow's CTC implementation insists on having the null be the
  // last label, and we want to be compatible, null has to be renumbered when
  // we add a class.
  int null1 = trainer1.null_char();
  int null2 = trainer2.null_char();
  EXPECT_EQ(null1 + 1, null2);
  std::vector<int> labels1_v(labels1);
  for (int &label : labels1_v) {
    if (label == null1) {
      label = null2;
    }
  }
  // Hand the container itself to the matcher: indexing labels2[0] would be
  // undefined behaviour whenever labels2 is empty.
  EXPECT_THAT(labels1_v, testing::ElementsAreArray(labels2));
  // To make sure we are not cheating somehow, we can now encode the Rupee
  // symbol, which we could not do before.
  EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
}

} // namespace tesseract
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								// (C) Copyright 2017, Google Inc.
 								// Licensed under the Apache License, Version 2.0 (the "License");
 								// you may not use this file except in compliance with the License.
 								// You may obtain a copy of the License at
 								// http://www.apache.org/licenses/LICENSE-2.0
 								// Unless required by applicable law or agreed to in writing, software
 								// distributed under the License is distributed on an "AS IS" BASIS,
 								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								// See the License for the specific language governing permissions and
 								// limitations under the License.
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								#include <string> // for std::string
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								#include "gmock/gmock.h" // for testing::ElementsAreArray
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
 								#include "include_gunit.h"
 								#include "lang_model_helpers.h"
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								#include "log.h" // for LOG
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								#include "lstmtrainer.h"
 								#include "unicharset_training_utils.h"
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
 								namespace tesseract {
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								std::string TestDataNameToPath(const std::string &name) {
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  return file::JoinPath(TESTING_DIR, name);
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								}
 								// This is an integration test that verifies that CombineLangModel works to
 								// the extent that an LSTMTrainer can be initialized with the result, and it
 								// can encode strings. More importantly, the test verifies that adding an extra
 								// character to the unicharset does not change the encoding of strings.
 								TEST(LangModelTest, AddACharacter) {
 								  constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
 								  constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
 								  // Setup the arguments.
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  std::string script_dir = LANGDATA_DIR;
 								  std::string eng_dir = file::JoinPath(script_dir, "eng");
 								  std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  UNICHARSET unicharset;
 								  EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  std::string version_str = "TestVersion";
-												Make tmp directory for all unit tests

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2020-12-31 01:17:58 +08:00
+								  file::MakeTmpdir();
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  std::string output_dir = FLAGS_test_tmpdir;
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  LOG(INFO) << "Output dir=" << output_dir << "\n";
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  std::string lang1 = "eng";
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  bool pass_through_recoder = false;
 								  // If these reads fail, we get a warning message and an empty list of words.
-												Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 03:36:20 +08:00
+								  std::vector<std::string> words = split(ReadFile(file::JoinPath(eng_dir, "eng.wordlist")), '\n');
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  EXPECT_GT(words.size(), 0);
-												Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 03:36:20 +08:00
+								  std::vector<std::string> puncs = split(ReadFile(file::JoinPath(eng_dir, "eng.punc")), '\n');
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  EXPECT_GT(puncs.size(), 0);
-												Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 03:36:20 +08:00
+								  std::vector<std::string> numbers = split(ReadFile(file::JoinPath(eng_dir, "eng.numbers")), '\n');
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  EXPECT_GT(numbers.size(), 0);
 								  bool lang_is_rtl = false;
 								  // Generate the traineddata file.
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
 								                                pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr,
 								                                nullptr));
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  // Init a trainer with it, and encode kTestString.
-												unittest: Remove dependency on absl::StrCat()

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-08-06 17:38:05 +08:00
+								  std::string traineddata1 = file::JoinPath(output_dir, lang1, lang1) + ".traineddata";
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  LSTMTrainer trainer1;
 								  trainer1.InitCharSet(traineddata1);
-												More std::vector.

											
										
										
											2021-01-07 18:57:49 +08:00
+								  std::vector<int> labels1;
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
-												Replace remaining STRING by std::string in unittest

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 05:52:52 +08:00
+								  std::string test1_decoded = trainer1.DecodeLabels(labels1);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string test1_str(&test1_decoded[0], test1_decoded.length());
 								  LOG(INFO) << "Labels1=" << test1_str << "\n";
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
 								  // Add a new character to the unicharset and try again.
 								  int size_before = unicharset.size();
 								  unicharset.unichar_insert("₹");
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  EXPECT_EQ(size_before + 1, unicharset.size());
 								  // Generate the traineddata file.
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  std::string lang2 = "extended";
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
 								                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,
 								                                           nullptr, nullptr));
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  // Init a trainer with it, and encode kTestString.
-												unittest: Remove dependency on absl::StrCat()

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-08-06 17:38:05 +08:00
+								  std::string traineddata2 = file::JoinPath(output_dir, lang2, lang2) + ".traineddata";
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  LSTMTrainer trainer2;
 								  trainer2.InitCharSet(traineddata2);
-												More std::vector.

											
										
										
											2021-01-07 18:57:49 +08:00
+								  std::vector<int> labels2;
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
-												Replace remaining STRING by std::string in unittest

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 05:52:52 +08:00
+								  std::string test2_decoded = trainer2.DecodeLabels(labels2);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string test2_str(&test2_decoded[0], test2_decoded.length());
 								  LOG(INFO) << "Labels2=" << test2_str << "\n";
 								  // encode kTestStringRupees.
-												More std::vector.

											
										
										
											2021-01-07 18:57:49 +08:00
+								  std::vector<int> labels3;
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
-												Replace remaining STRING by std::string in unittest

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 05:52:52 +08:00
+								  std::string test3_decoded = trainer2.DecodeLabels(labels3);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string test3_str(&test3_decoded[0], test3_decoded.length());
 								  LOG(INFO) << "labels3=" << test3_str << "\n";
 								  // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
 								  // Since Tensor Flow's CTC implementation insists on having the null be the
 								  // last label, and we want to be compatible, null has to be renumbered when
 								  // we add a class.
 								  int null1 = trainer1.null_char();
 								  int null2 = trainer2.null_char();
 								  EXPECT_EQ(null1 + 1, null2);
 								  std::vector<int> labels1_v(labels1.size());
-												Fix some compiler warnings

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 16:01:54 +08:00
+								  for (unsigned i = 0; i < labels1.size(); ++i) {
-												Modernize code (clang-tidy -checks='-*,google-readability-braces-around-statements')

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 15:48:50 +08:00
+								    if (labels1[i] == null1) {
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								      labels1_v[i] = null2;
-												Modernize code (clang-tidy -checks='-*,google-readability-braces-around-statements')

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 15:48:50 +08:00
+								    } else {
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								      labels1_v[i] = labels1[i];
-												Modernize code (clang-tidy -checks='-*,google-readability-braces-around-statements')

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 15:48:50 +08:00
+								    }
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  }
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  // To make sure we we are not cheating somehow, we can now encode the Rupee
 								  // symbol, which we could not do before.
 								  EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
 								  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
 								}
 								// Same as above test, for hin instead of eng
 								TEST(LangModelTest, AddACharacterHindi) {
 								  constexpr char kTestString[] = "हिन्दी में एक लाइन लिखें";
 								  constexpr char kTestStringRupees[] = "हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००";
 								  // Setup the arguments.
 								  std::string script_dir = LANGDATA_DIR;
 								  std::string hin_dir = file::JoinPath(script_dir, "hin");
 								  std::string unicharset_path = TestDataNameToPath("hin_beam.unicharset");
 								  UNICHARSET unicharset;
 								  EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
 								  std::string version_str = "TestVersion";
-												Make tmp directory for all unit tests

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2020-12-31 01:17:58 +08:00
+								  file::MakeTmpdir();
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string output_dir = FLAGS_test_tmpdir;
 								  LOG(INFO) << "Output dir=" << output_dir << "\n";
 								  std::string lang1 = "hin";
 								  bool pass_through_recoder = false;
 								  // If these reads fail, we get a warning message and an empty list of words.
-												Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 03:36:20 +08:00
+								  std::vector<std::string> words = split(ReadFile(file::JoinPath(hin_dir, "hin.wordlist")), '\n');
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_GT(words.size(), 0);
-												Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 03:36:20 +08:00
+								  std::vector<std::string> puncs = split(ReadFile(file::JoinPath(hin_dir, "hin.punc")), '\n');
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_GT(puncs.size(), 0);
-												Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 03:36:20 +08:00
+								  std::vector<std::string> numbers = split(ReadFile(file::JoinPath(hin_dir, "hin.numbers")), '\n');
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_GT(numbers.size(), 0);
 								  bool lang_is_rtl = false;
 								  // Generate the traineddata file.
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
 								                                pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr,
 								                                nullptr));
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  // Init a trainer with it, and encode kTestString.
-												unittest: Remove dependency on absl::StrCat()

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-08-06 17:38:05 +08:00
+								  std::string traineddata1 = file::JoinPath(output_dir, lang1, lang1) + ".traineddata";
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  LSTMTrainer trainer1;
 								  trainer1.InitCharSet(traineddata1);
-												More std::vector.

											
										
										
											2021-01-07 18:57:49 +08:00
+								  std::vector<int> labels1;
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
-												Replace remaining STRING by std::string in unittest

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 05:52:52 +08:00
+								  std::string test1_decoded = trainer1.DecodeLabels(labels1);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string test1_str(&test1_decoded[0], test1_decoded.length());
 								  LOG(INFO) << "Labels1=" << test1_str << "\n";
 								  // Add a new character to the unicharset and try again.
 								  int size_before = unicharset.size();
 								  unicharset.unichar_insert("₹");
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_EQ(size_before + 1, unicharset.size());
 								  // Generate the traineddata file.
 								  std::string lang2 = "extendedhin";
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
 								                                           pass_through_recoder, words, puncs, numbers, lang_is_rtl,
 								                                           nullptr, nullptr));
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  // Init a trainer with it, and encode kTestString.
-												unittest: Remove dependency on absl::StrCat()

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-08-06 17:38:05 +08:00
+								  std::string traineddata2 = file::JoinPath(output_dir, lang2, lang2) + ".traineddata";
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  LSTMTrainer trainer2;
 								  trainer2.InitCharSet(traineddata2);
-												More std::vector.

											
										
										
											2021-01-07 18:57:49 +08:00
+								  std::vector<int> labels2;
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
-												Replace remaining STRING by std::string in unittest

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 05:52:52 +08:00
+								  std::string test2_decoded = trainer2.DecodeLabels(labels2);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string test2_str(&test2_decoded[0], test2_decoded.length());
 								  LOG(INFO) << "Labels2=" << test2_str << "\n";
 								  // encode kTestStringRupees.
-												More std::vector.

											
										
										
											2021-01-07 18:57:49 +08:00
+								  std::vector<int> labels3;
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
-												Replace remaining STRING by std::string in unittest

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-15 05:52:52 +08:00
+								  std::string test3_decoded = trainer2.DecodeLabels(labels3);
-												Partially fix and enable more unittests
Add more subtests to langmodel_test

Add more subtests to langmodel_test

fix and enable lstmtrainer_test

fix and enable some subtests from recodebeam_test

partial fix for resultiterator_test

fix typo removing the terminating linefeed.

fix typo

changes

											
										
										
											2019-01-25 22:05:57 +08:00
+								  std::string test3_str(&test3_decoded[0], test3_decoded.length());
 								  LOG(INFO) << "labels3=" << test3_str << "\n";
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
 								  // Since TensorFlow's CTC implementation insists on having the null be the
 								  // last label, and we want to be compatible, null has to be renumbered when
 								  // we add a class.
 								  int null1 = trainer1.null_char();
 								  int null2 = trainer2.null_char();
 								  EXPECT_EQ(null1 + 1, null2);
 								  std::vector<int> labels1_v(labels1.size());
-												Fix some compiler warnings

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 16:01:54 +08:00
+								  for (unsigned i = 0; i < labels1.size(); ++i) {
-												Modernize code (clang-tidy -checks='-*,google-readability-braces-around-statements')

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 15:48:50 +08:00
+								    if (labels1[i] == null1) {
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								      labels1_v[i] = null2;
-												Modernize code (clang-tidy -checks='-*,google-readability-braces-around-statements')

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 15:48:50 +08:00
+								    } else {
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								      labels1_v[i] = labels1[i];
-												Modernize code (clang-tidy -checks='-*,google-readability-braces-around-statements')

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2021-03-22 15:48:50 +08:00
+								    }
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  }
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								  EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));
-												unittest: Add lang_model_test (only works partially)

The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-10-12 17:29:45 +08:00
+								  // To make sure we are not cheating somehow, we can now encode the Rupee
-												Add more unittests from Google

They were provided by Jeff Breidenbach <jbreiden@google.com>.

Signed-off-by: Stefan Weil <sw@weilnetz.de>

											
										
										
											2018-08-24 21:07:48 +08:00
+								  // symbol, which we could not do before.
 								  EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
 								  EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
 								}
-												[clang-format] Format unit tests.

											
										
										
											2021-03-13 05:06:34 +08:00
+								} // namespace tesseract