diff --git a/unittest/apiexample_test.cc b/unittest/apiexample_test.cc index eecd861e..76373b8c 100644 --- a/unittest/apiexample_test.cc +++ b/unittest/apiexample_test.cc @@ -22,21 +22,23 @@ // expects clone of tessdata_fast repo in ../../tessdata_fast //#include "log.h" +#include +#include #include #include #include #include -#include // std::unique_ptr +#include // std::unique_ptr #include -#include #include "include_gunit.h" -#include namespace tesseract { class QuickTest : public testing::Test { - protected: - virtual void SetUp() { start_time_ = time(nullptr); } +protected: + virtual void SetUp() { + start_time_ = time(nullptr); + } virtual void TearDown() { #ifndef NDEBUG // Debug builds can be very slow, so allow 4 min for OCR of a test image. @@ -49,71 +51,62 @@ class QuickTest : public testing::Test { #endif const time_t end_time = time(nullptr); EXPECT_TRUE(end_time - start_time_ <= MAX_SECONDS_FOR_TEST) - << "The test took too long - " - << ::testing::PrintToString(end_time - start_time_); + << "The test took too long - " << ::testing::PrintToString(end_time - start_time_); } time_t start_time_; }; -void OCRTester(const char* imgname, const char* groundtruth, - const char* tessdatadir, const char* lang) { +void OCRTester(const char *imgname, const char *groundtruth, const char *tessdatadir, + const char *lang) { // log.info() << tessdatadir << " for language: " << lang << std::endl; - char* outText; - std::locale loc("C"); // You can also use "" for the default system locale + char *outText; + std::locale loc("C"); // You can also use "" for the default system locale std::ifstream file(groundtruth); - file.imbue(loc); // Use it for file input - std::string gtText((std::istreambuf_iterator(file)), - std::istreambuf_iterator()); + file.imbue(loc); // Use it for file input + std::string gtText((std::istreambuf_iterator(file)), std::istreambuf_iterator()); std::unique_ptr api(new tesseract::TessBaseAPI()); - ASSERT_FALSE(api->Init(tessdatadir, lang)) - << 
"Could not initialize tesseract."; - Pix* image = pixRead(imgname); + ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract."; + Pix *image = pixRead(imgname); ASSERT_TRUE(image != nullptr) << "Failed to read test image."; api->SetImage(image); outText = api->GetUTF8Text(); - EXPECT_EQ(gtText, outText) - << "Phototest.tif OCR does not match ground truth for " - << ::testing::PrintToString(lang); + EXPECT_EQ(gtText, outText) << "Phototest.tif OCR does not match ground truth for " + << ::testing::PrintToString(lang); api->End(); delete[] outText; pixDestroy(&image); } -class MatchGroundTruth : public QuickTest, - public ::testing::WithParamInterface {}; +class MatchGroundTruth : public QuickTest, public ::testing::WithParamInterface {}; TEST_P(MatchGroundTruth, FastPhototestOCR) { - OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", - TESSDATA_DIR "_fast", GetParam()); + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", TESSDATA_DIR "_fast", + GetParam()); } TEST_P(MatchGroundTruth, BestPhototestOCR) { - OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", - TESSDATA_DIR "_best", GetParam()); + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", TESSDATA_DIR "_best", + GetParam()); } TEST_P(MatchGroundTruth, TessPhototestOCR) { - OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", - TESSDATA_DIR, GetParam()); + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", TESSDATA_DIR, GetParam()); } INSTANTIATE_TEST_SUITE_P(Eng, MatchGroundTruth, ::testing::Values("eng")); -INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth, - ::testing::Values("script/Latin")); -INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth, - ::testing::Values("script/Devanagari")); -INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth, - ::testing::Values("script/Arabic")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth, 
::testing::Values("script/Latin")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth, ::testing::Values("script/Devanagari")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth, ::testing::Values("script/Arabic")); class EuroText : public QuickTest {}; TEST_F(EuroText, FastLatinOCR) { - OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt", - TESSDATA_DIR "_fast", "script/Latin"); + OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt", TESSDATA_DIR "_fast", + "script/Latin"); } // script/Latin for eurotext.tif does not match groundtruth // for tessdata & tessdata_best. // so do not test these here. -} // namespace +} // namespace tesseract diff --git a/unittest/applybox_test.cc b/unittest/applybox_test.cc index 33ca5577..6b01e6df 100644 --- a/unittest/applybox_test.cc +++ b/unittest/applybox_test.cc @@ -9,32 +9,38 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include +#include +#include #include "boxread.h" #include "rect.h" -#include #include "include_gunit.h" namespace tesseract { -const char* kTruthTextWords = "To simple burn running of goods lately.\n"; -const char* kTruthTextLine = "Tosimpleburnrunningofgoodslately.\n"; +const char *kTruthTextWords = "To simple burn running of goods lately.\n"; +const char *kTruthTextLine = "Tosimpleburnrunningofgoodslately.\n"; // The fixture for testing Tesseract. 
class ApplyBoxTest : public testing::Test { - protected: - std::string TestDataNameToPath(const std::string& name) { +protected: + std::string TestDataNameToPath(const std::string &name) { return file::JoinPath(TESTING_DIR, name); } - std::string TessdataPath() { return TESSDATA_DIR; } + std::string TessdataPath() { + return TESSDATA_DIR; + } - ApplyBoxTest() { src_pix_ = nullptr; } - ~ApplyBoxTest() { pixDestroy(&src_pix_); } + ApplyBoxTest() { + src_pix_ = nullptr; + } + ~ApplyBoxTest() { + pixDestroy(&src_pix_); + } - bool SetImage(const char* filename) { + bool SetImage(const char *filename) { bool found = false; pixDestroy(&src_pix_); src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); @@ -53,8 +59,8 @@ class ApplyBoxTest : public testing::Test { // the boxes match the given box file well enough. // If line_mode is true, ApplyBoxes is run in line segmentation mode, // otherwise the input box file is assumed to have character-level boxes. - void VerifyBoxesAndText(const char* imagefile, const char* truth_str, - const char* target_box_file, bool line_mode) { + void VerifyBoxesAndText(const char *imagefile, const char *truth_str, const char *target_box_file, + bool line_mode) { if (!SetImage(imagefile)) { // eng.traineddata not found or other problem during Init. GTEST_SKIP(); @@ -65,64 +71,58 @@ class ApplyBoxTest : public testing::Test { else api_.SetVariable("tessedit_resegment_from_boxes", "1"); api_.Recognize(nullptr); - char* ocr_text = api_.GetUTF8Text(); + char *ocr_text = api_.GetUTF8Text(); EXPECT_STREQ(truth_str, ocr_text); delete[] ocr_text; // Test the boxes by reading the target box file in parallel with the // bounding boxes in the ocr output. 
std::string box_filename = TestDataNameToPath(target_box_file); - FILE* box_file = OpenBoxFile(box_filename.c_str()); + FILE *box_file = OpenBoxFile(box_filename.c_str()); ASSERT_TRUE(box_file != nullptr); int height = pixGetHeight(src_pix_); - ResultIterator* it = api_.GetIterator(); + ResultIterator *it = api_.GetIterator(); do { int left, top, right, bottom; - EXPECT_TRUE( - it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom)); + EXPECT_TRUE(it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom)); TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top)); int line_number = 0; TBOX truth_box; STRING box_text; - EXPECT_TRUE( - ReadNextBox(0, &line_number, box_file, &box_text, &truth_box)); + EXPECT_TRUE(ReadNextBox(0, &line_number, box_file, &box_text, &truth_box)); // Testing for major overlap is a bit weak, but if they all // major overlap successfully, then it has to be fairly close. EXPECT_TRUE(ocr_box.major_overlap(truth_box)); // Also check that the symbol text matches the box text. - char* symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL); + char *symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL); EXPECT_STREQ(box_text.c_str(), symbol_text); delete[] symbol_text; } while (it->Next(tesseract::RIL_SYMBOL)); delete it; } - Pix* src_pix_; + Pix *src_pix_; std::string ocr_text_; tesseract::TessBaseAPI api_; }; // Tests character-level applyboxes on normal Times New Roman. TEST_F(ApplyBoxTest, TimesCharLevel) { - VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box", - false); + VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box", false); } // Tests character-level applyboxes on italic Times New Roman. 
TEST_F(ApplyBoxTest, ItalicCharLevel) { - VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box", - false); + VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box", false); } // Tests line-level applyboxes on normal Times New Roman. TEST_F(ApplyBoxTest, TimesLineLevel) { - VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine, - "trainingtimes.box", true); + VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine, "trainingtimes.box", true); } // Tests line-level applyboxes on italic Times New Roman. TEST_F(ApplyBoxTest, ItalLineLevel) { - VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box", - true); + VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box", true); } -} // namespace +} // namespace tesseract diff --git a/unittest/baseapi_test.cc b/unittest/baseapi_test.cc index 6b05f3b6..c70e8d6c 100644 --- a/unittest/baseapi_test.cc +++ b/unittest/baseapi_test.cc @@ -33,21 +33,20 @@ namespace tesseract { using ::testing::ContainsRegex; using ::testing::HasSubstr; -static const char* langs[] = {"eng", "vie", "hin", "ara", nullptr}; -static const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif", - "arabic.tif", nullptr}; -static const char* gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", - "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", - "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", - nullptr}; +static const char *langs[] = {"eng", "vie", "hin", "ara", nullptr}; +static const char *image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif", "arabic.tif", + nullptr}; +static const char *gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", + "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", + "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr}; class FriendlyTessBaseAPI : public tesseract::TessBaseAPI { FRIEND_TEST(TesseractTest, LSTMGeometryTest); }; -std::string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) { +std::string 
GetCleanedTextResult(tesseract::TessBaseAPI *tess, Pix *pix) { tess->SetImage(pix); - char* result = tess->GetUTF8Text(); + char *result = tess->GetUTF8Text(); std::string ocr_result = result; delete[] result; absl::StripAsciiWhitespace(&ocr_result); @@ -56,8 +55,8 @@ std::string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) { // The fixture for testing Tesseract. class TesseractTest : public testing::Test { - protected: - static std::string TestDataNameToPath(const std::string& name) { +protected: + static std::string TestDataNameToPath(const std::string &name) { return file::JoinPath(TESTING_DIR, name); } static std::string TessdataPath() { @@ -71,11 +70,11 @@ TEST_F(TesseractTest, BasicTesseractTest) { std::string truth_text; std::string ocr_text; if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { - Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str()); + Pix *src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str()); CHECK(src_pix); ocr_text = GetCleanedTextResult(&api, src_pix); - CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"), - &truth_text, file::Defaults())); + CHECK_OK( + file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults())); absl::StripAsciiWhitespace(&truth_text); EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str()); pixDestroy(&src_pix); @@ -123,10 +122,10 @@ TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) { GTEST_SKIP(); return; } - Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str()); + Pix *src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str()); CHECK(src_pix); api.SetImage(src_pix); - char* result = api.GetHOCRText(0); + char *result = api.GetHOCRText(0); EXPECT_TRUE(result != nullptr); EXPECT_THAT(result, HasSubstr("Hello")); EXPECT_THAT(result, HasSubstr("
]* baseline [-.0-9]+ [-.0-9]+" })); + EXPECT_TRUE(std::regex_search( + result, std::regex{"]* baseline [-.0-9]+ [-.0-9]+"})); delete[] result; pixDestroy(&src_pix); @@ -161,15 +161,14 @@ TEST_F(TesseractTest, AdaptToWordStrTest) { // Skip test because TessBaseAPI::AdaptToWordStr is missing. GTEST_SKIP(); #else - static const char* kTrainingPages[] = { - "136.tif", "256.tif", "410.tif", "432.tif", "540.tif", - "692.tif", "779.tif", "793.tif", "808.tif", "815.tif", - "12.tif", "12.tif", nullptr}; - static const char* kTrainingText[] = { - "1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0", "6 9 2", "7 7 9", - "7 9 3", "8 0 8", "8 1 5", "1 2", "1 2", nullptr}; - static const char* kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr}; - static const char* kTestText[] = {"324", "433", "12", nullptr}; + static const char *kTrainingPages[] = {"136.tif", "256.tif", "410.tif", "432.tif", "540.tif", + "692.tif", "779.tif", "793.tif", "808.tif", "815.tif", + "12.tif", "12.tif", nullptr}; + static const char *kTrainingText[] = {"1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0", + "6 9 2", "7 7 9", "7 9 3", "8 0 8", "8 1 5", + "1 2", "1 2", nullptr}; + static const char *kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr}; + static const char *kTestText[] = {"324", "433", "12", nullptr}; tesseract::TessBaseAPI api; std::string truth_text; std::string ocr_text; @@ -183,20 +182,18 @@ TEST_F(TesseractTest, AdaptToWordStrTest) { // Train on the training text. 
for (int i = 0; kTrainingPages[i] != nullptr; ++i) { std::string image_file = TestDataNameToPath(kTrainingPages[i]); - Pix* src_pix = pixRead(image_file.c_str()); + Pix *src_pix = pixRead(image_file.c_str()); CHECK(src_pix); api.SetImage(src_pix); - EXPECT_TRUE( - api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i])) - << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " - << image_file; + EXPECT_TRUE(api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i])) + << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " << image_file; pixDestroy(&src_pix); } // Test the test text. api.SetVariable("tess_bn_matching", "1"); api.SetPageSegMode(tesseract::PSM_SINGLE_WORD); for (int i = 0; kTestPages[i] != nullptr; ++i) { - Pix* src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str()); + Pix *src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str()); CHECK(src_pix); ocr_text = GetCleanedTextResult(&api, src_pix); absl::StripAsciiWhitespace(&truth_text); @@ -216,11 +213,11 @@ TEST_F(TesseractTest, BasicLSTMTest) { GTEST_SKIP(); return; } - Pix* src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str()); + Pix *src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str()); CHECK(src_pix); ocr_text = GetCleanedTextResult(&api, src_pix); - CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"), - &truth_text, file::Defaults())); + CHECK_OK( + file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults())); absl::StripAsciiWhitespace(&truth_text); EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str()); pixDestroy(&src_pix); @@ -233,7 +230,7 @@ TEST_F(TesseractTest, BasicLSTMTest) { // errors due to float/int conversions (e.g., see OUTLINE::move() in // ccstruct/poutline.h) Instead, we do a loose check. 
TEST_F(TesseractTest, LSTMGeometryTest) { - Pix* src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str()); + Pix *src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str()); FriendlyTessBaseAPI api; if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) { // eng.traineddata not found. @@ -243,16 +240,15 @@ TEST_F(TesseractTest, LSTMGeometryTest) { api.SetImage(src_pix); ASSERT_EQ(api.Recognize(nullptr), 0); - const PAGE_RES* page_res = api.GetPageRes(); - PAGE_RES_IT page_res_it(const_cast(page_res)); + const PAGE_RES *page_res = api.GetPageRes(); + PAGE_RES_IT page_res_it(const_cast(page_res)); page_res_it.restart_page(); - BLOCK* block = page_res_it.block()->block; + BLOCK *block = page_res_it.block()->block; CHECK(block); // extract word and character boxes for each word - for (page_res_it.restart_page(); page_res_it.word() != nullptr; - page_res_it.forward()) { - WERD_RES* word = page_res_it.word(); + for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); CHECK(word); CHECK(word->best_choice); CHECK_GT(word->best_choice->length(), 0); @@ -279,17 +275,15 @@ TEST_F(TesseractTest, LSTMGeometryTest) { TEST_F(TesseractTest, InitConfigOnlyTest) { // Languages for testing initialization. 
- const char* langs[] = {"eng", "chi_tra", "jpn", "vie"}; + const char *langs[] = {"eng", "chi_tra", "jpn", "vie"}; std::unique_ptr api; CycleTimer timer; for (size_t i = 0; i < countof(langs); ++i) { api.reset(new tesseract::TessBaseAPI); timer.Restart(); - EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], - tesseract::OEM_TESSERACT_ONLY)); + EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], tesseract::OEM_TESSERACT_ONLY)); timer.Stop(); - LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() - << "ms in regular init"; + LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() << "ms in regular init"; } // Init variables to set for config-only initialization. std::vector vars_vec, vars_values; @@ -299,12 +293,10 @@ TEST_F(TesseractTest, InitConfigOnlyTest) { for (size_t i = 0; i < countof(langs); ++i) { api.reset(new tesseract::TessBaseAPI); timer.Restart(); - EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], - tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec, - &vars_values, false)); + EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], tesseract::OEM_TESSERACT_ONLY, nullptr, + 0, &vars_vec, &vars_values, false)); timer.Stop(); - LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() - << "ms in config-only init"; + LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() << "ms in config-only init"; } } @@ -316,12 +308,13 @@ TEST_F(TesseractTest, InitConfigOnlyTest) { // OEM_DEFAULT mode. TEST(TesseractInstanceTest, TestMultipleTessInstances) { int num_langs = 0; - while (langs[num_langs] != nullptr) ++num_langs; + while (langs[num_langs] != nullptr) + ++num_langs; const std::string kTessdataPath = TESSDATA_DIR; // Preload images and verify that OCR is correct on them individually. 
- std::vector pix(num_langs); + std::vector pix(num_langs); for (int i = 0; i < num_langs; ++i) { SCOPED_TRACE(absl::StrCat("Single instance test with lang = ", langs[i])); std::string path = file::JoinPath(TESTING_DIR, image_files[i]); @@ -351,7 +344,8 @@ TEST(TesseractInstanceTest, TestMultipleTessInstances) { } } - for (int i = 0; i < num_langs; ++i) pixDestroy(&pix[i]); + for (int i = 0; i < num_langs; ++i) + pixDestroy(&pix[i]); } // Tests whether Tesseract parameters are correctly set for the two instances. @@ -374,7 +368,7 @@ TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) { tesseract::TessBaseAPI tess1, tess2; for (int i = 0; i < 2; ++i) { - tesseract::TessBaseAPI* api = (i == 0) ? &tess1 : &tess2; + tesseract::TessBaseAPI *api = (i == 0) ? &tess1 : &tess2; api->Init(kTessdataPath.c_str(), langs[i].c_str()); api->SetVariable(illegal_name.c_str(), "none"); api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str()); @@ -383,7 +377,7 @@ TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) { api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str()); } for (int i = 0; i < 2; ++i) { - tesseract::TessBaseAPI* api = (i == 0) ? &tess1 : &tess2; + tesseract::TessBaseAPI *api = (i == 0) ? 
&tess1 : &tess2; EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str())); int intvar; EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar)); @@ -391,12 +385,11 @@ TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) { bool boolvar; EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar)); EXPECT_EQ(bool_param[i], boolvar); - EXPECT_STREQ(str_param[i].c_str(), - api->GetStringVariable(str_param_name.c_str())); + EXPECT_STREQ(str_param[i].c_str(), api->GetStringVariable(str_param_name.c_str())); double doublevar; EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar)); EXPECT_EQ(double_param[i], doublevar); } } -} // namespace +} // namespace tesseract diff --git a/unittest/baseapi_thread_test.cc b/unittest/baseapi_thread_test.cc index a8c704a3..51858a53 100644 --- a/unittest/baseapi_thread_test.cc +++ b/unittest/baseapi_thread_test.cc @@ -24,13 +24,13 @@ #include #include #ifdef INCLUDE_TENSORFLOW -#include +# include #endif -#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace #include -#include "include_gunit.h" #include +#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace #include "commandlineflags.h" +#include "include_gunit.h" #include "log.h" // Run with Tesseract instances. @@ -46,25 +46,23 @@ BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances"); INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run."); INT_PARAM_FLAG(max_concurrent_instances, 0, - "Maximum number of instances to run in parallel at any given " - "instant. The number of concurrent instances cannot exceed " - "reps * number_of_langs_tested, which is also the default value."); + "Maximum number of instances to run in parallel at any given " + "instant. 
The number of concurrent instances cannot exceed " + "reps * number_of_langs_tested, which is also the default value."); namespace tesseract { -static const char* kTessLangs[] = {"eng", "vie", nullptr}; -static const char* kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr}; -static const char* kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", - nullptr}; +static const char *kTessLangs[] = {"eng", "vie", nullptr}; +static const char *kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr}; +static const char *kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", nullptr}; -static const char* kCubeLangs[] = {"hin", "ara", nullptr}; -static const char* kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr}; -static const char* kCubeTruthText[] = { - "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", - "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr}; +static const char *kCubeLangs[] = {"hin", "ara", nullptr}; +static const char *kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr}; +static const char *kCubeTruthText[] = {"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", + "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr}; class BaseapiThreadTest : public ::testing::Test { - protected: +protected: static void SetUpTestCase() { CHECK(FLAGS_test_tesseract || FLAGS_test_cube) << "Need to test at least one of Tesseract/Cube!"; @@ -99,35 +97,37 @@ class BaseapiThreadTest : public ::testing::Test { const int n = num_langs_ * FLAGS_reps; for (int i = 0; i < n; ++i) { std::string path = TESTING_DIR "/" + image_files[i % num_langs_]; - Pix* new_pix = pixRead(path.c_str()); + Pix *new_pix = pixRead(path.c_str()); QCHECK(new_pix != nullptr) << "Could not read " << path; pix_.push_back(new_pix); } #ifdef INCLUDE_TENSORFLOW - pool_size_ = (FLAGS_max_concurrent_instances < 1) - ? num_langs_ * FLAGS_reps - : FLAGS_max_concurrent_instances; + pool_size_ = (FLAGS_max_concurrent_instances < 1) ? 
num_langs_ * FLAGS_reps + : FLAGS_max_concurrent_instances; #endif } static void TearDownTestCase() { - for (auto& pix : pix_) { + for (auto &pix : pix_) { pixDestroy(&pix); } } #ifdef INCLUDE_TENSORFLOW void ResetPool() { - pool_.reset(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_)); + pool_.reset( + new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_)); } - void WaitForPoolWorkers() { pool_.reset(nullptr); } + void WaitForPoolWorkers() { + pool_.reset(nullptr); + } std::unique_ptr pool_; static int pool_size_; #endif - static std::vector pix_; + static std::vector pix_; static std::vector langs_; static std::vector gt_text_; static int num_langs_; @@ -137,27 +137,27 @@ class BaseapiThreadTest : public ::testing::Test { #ifdef INCLUDE_TENSORFLOW int BaseapiThreadTest::pool_size_; #endif -std::vector BaseapiThreadTest::pix_; +std::vector BaseapiThreadTest::pix_; std::vector BaseapiThreadTest::langs_; std::vector BaseapiThreadTest::gt_text_; int BaseapiThreadTest::num_langs_; -static void InitTessInstance(TessBaseAPI* tess, const std::string& lang) { +static void InitTessInstance(TessBaseAPI *tess, const std::string &lang) { CHECK(tess != nullptr); EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str())); } -static void GetCleanedText(TessBaseAPI* tess, Pix* pix, std::string* ocr_text) { +static void GetCleanedText(TessBaseAPI *tess, Pix *pix, std::string *ocr_text) { tess->SetImage(pix); - char* result = tess->GetUTF8Text(); + char *result = tess->GetUTF8Text(); *ocr_text = result; delete[] result; absl::StripAsciiWhitespace(ocr_text); } -static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lang, - const std::string& expected_text) { - TessBaseAPI* tess_local = nullptr; +static void VerifyTextResult(TessBaseAPI *tess, Pix *pix, const std::string &lang, + const std::string &expected_text) { + TessBaseAPI *tess_local = nullptr; if (tess) { tess_local = tess; } else { @@ 
-167,7 +167,8 @@ static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lan std::string ocr_text; GetCleanedText(tess_local, pix, &ocr_text); EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str()); - if (tess_local != tess) delete tess_local; + if (tess_local != tess) + delete tess_local; } // Check that Tesseract/Cube produce the correct results in single-threaded @@ -178,8 +179,7 @@ TEST_F(BaseapiThreadTest, TestBasicSanity) { InitTessInstance(&tess, langs_[i]); std::string ocr_text; GetCleanedText(&tess, pix_[i], &ocr_text); - CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) - << "Failed with lang = " << langs_[i]; + CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) << "Failed with lang = " << langs_[i]; } } @@ -208,8 +208,8 @@ TEST_F(BaseapiThreadTest, TestRecognition) { ResetPool(); for (int i = 0; i < n; ++i) { - pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], - langs_[i % num_langs_], gt_text_[i % num_langs_])); + pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], langs_[i % num_langs_], + gt_text_[i % num_langs_])); } WaitForPoolWorkers(); #endif @@ -220,10 +220,10 @@ TEST_F(BaseapiThreadTest, TestAll) { const int n = num_langs_ * FLAGS_reps; ResetPool(); for (int i = 0; i < n; ++i) { - pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], - langs_[i % num_langs_], gt_text_[i % num_langs_])); + pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], langs_[i % num_langs_], + gt_text_[i % num_langs_])); } WaitForPoolWorkers(); #endif } -} // namespace +} // namespace tesseract diff --git a/unittest/bitvector_test.cc b/unittest/bitvector_test.cc index 9be718a0..94f573ea 100644 --- a/unittest/bitvector_test.cc +++ b/unittest/bitvector_test.cc @@ -22,38 +22,41 @@ const int kPrimeLimit = 1000; namespace tesseract { class BitVectorTest : public testing::Test { - protected: +protected: void SetUp() override { std::locale::global(std::locale("")); file::MakeTmpdir(); } - public: - 
std::string OutputNameToPath(const std::string& name) { +public: + std::string OutputNameToPath(const std::string &name) { return file::JoinPath(FLAGS_test_tmpdir, name); } // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes. - void ComputePrimes(BitVector* map) { + void ComputePrimes(BitVector *map) { map->Init(kPrimeLimit + 1); TestAll(*map, false); map->SetBit(2); // Set all the odds to true. - for (int i = 3; i <= kPrimeLimit; i += 2) map->SetValue(i, true); + for (int i = 3; i <= kPrimeLimit; i += 2) + map->SetValue(i, true); int factor_limit = static_cast(sqrt(1.0 + kPrimeLimit)); for (int f = 3; f <= factor_limit; f += 2) { if (map->At(f)) { - for (int m = 2; m * f <= kPrimeLimit; ++m) map->ResetBit(f * m); + for (int m = 2; m * f <= kPrimeLimit; ++m) + map->ResetBit(f * m); } } } - void TestPrimes(const BitVector& map) { + void TestPrimes(const BitVector &map) { // Now all primes in the vector are true, and all others false. // According to Wikipedia, there are 168 primes under 1000, the last // of which is 997. int total_primes = 0; for (int i = 0; i <= kPrimeLimit; ++i) { - if (map[i]) ++total_primes; + if (map[i]) + ++total_primes; } EXPECT_EQ(168, total_primes); EXPECT_TRUE(map[997]); @@ -61,7 +64,7 @@ class BitVectorTest : public testing::Test { EXPECT_FALSE(map[999]); } // Test that all bits in the vector have the given value. - void TestAll(const BitVector& map, bool value) { + void TestAll(const BitVector &map, bool value) { for (int i = 0; i < map.size(); ++i) { EXPECT_EQ(value, map[i]); } @@ -70,7 +73,7 @@ class BitVectorTest : public testing::Test { // Sets up a BitVector with bit patterns for byte values in // [start_byte, end_byte) positioned every spacing bytes (for spacing >= 1) // with spacing-1 zero bytes in between the pattern bytes. 
- void SetBitPattern(int start_byte, int end_byte, int spacing, BitVector* bv) { + void SetBitPattern(int start_byte, int end_byte, int spacing, BitVector *bv) { bv->Init((end_byte - start_byte) * 8 * spacing); for (int byte_value = start_byte; byte_value < end_byte; ++byte_value) { for (int bit = 0; bit < 8; ++bit) { @@ -82,7 +85,7 @@ class BitVectorTest : public testing::Test { // Expects that every return from NextSetBit is really set and that all others // are really not set. Checks the return from NumSetBits also. - void ExpectCorrectBits(const BitVector& bv) { + void ExpectCorrectBits(const BitVector &bv) { int bit_index = -1; int prev_bit_index = -1; int num_bits_tested = 0; @@ -119,7 +122,7 @@ TEST_F(BitVectorTest, Primes) { TestPrimes(map3); // Test file i/o too. std::string filename = OutputNameToPath("primesbitvector"); - FILE* fp = fopen(filename.c_str(), "wb"); + FILE *fp = fopen(filename.c_str(), "wb"); ASSERT_TRUE(fp != nullptr); EXPECT_TRUE(map.Serialize(fp)); fclose(fp); @@ -163,4 +166,4 @@ TEST_F(BitVectorTest, TestNumSetBits) { } } -} // namespace. +} // namespace tesseract diff --git a/unittest/capiexample_c_test.c b/unittest/capiexample_c_test.c index 5917f0c4..9783071d 100644 --- a/unittest/capiexample_c_test.c +++ b/unittest/capiexample_c_test.c @@ -14,8 +14,7 @@ #include // Verifies that the libtesseract library has C API symbols. -int main() -{ - printf("%s\n", TessVersion()); - return 0; +int main() { + printf("%s\n", TessVersion()); + return 0; } diff --git a/unittest/capiexample_test.cc b/unittest/capiexample_test.cc index 3c843056..262fbc7d 100644 --- a/unittest/capiexample_test.cc +++ b/unittest/capiexample_test.cc @@ -16,4 +16,6 @@ #include // Verifies that the libtesseract library has C API symbols. 
-TEST(C, VersionTest) { TessVersion(); } +TEST(C, VersionTest) { + TessVersion(); +} diff --git a/unittest/cleanapi_test.cc b/unittest/cleanapi_test.cc index 4d284af0..da8c502a 100644 --- a/unittest/cleanapi_test.cc +++ b/unittest/cleanapi_test.cc @@ -23,6 +23,8 @@ enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD }; namespace tesseract { // Verifies that the global namespace is clean. -TEST(CleanNamespaceTess, DummyTest) { tesseract::TessBaseAPI api; } +TEST(CleanNamespaceTess, DummyTest) { + tesseract::TessBaseAPI api; +} -} // namespace. +} // namespace tesseract diff --git a/unittest/colpartition_test.cc b/unittest/colpartition_test.cc index caebe605..1c0e5902 100644 --- a/unittest/colpartition_test.cc +++ b/unittest/colpartition_test.cc @@ -16,7 +16,7 @@ namespace tesseract { class TestableColPartition : public ColPartition { - public: +public: void SetColumnRange(int first, int last) { set_first_column(first); set_last_column(last); @@ -24,7 +24,7 @@ class TestableColPartition : public ColPartition { }; class ColPartitionTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } @@ -73,4 +73,4 @@ TEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) { EXPECT_TRUE(b.IsInSameColumnAs(a)); } -} // namespace +} // namespace tesseract diff --git a/unittest/commandlineflags_test.cc b/unittest/commandlineflags_test.cc index b5ada320..b192c234 100644 --- a/unittest/commandlineflags_test.cc +++ b/unittest/commandlineflags_test.cc @@ -30,22 +30,21 @@ STRING_PARAM_FLAG(q, "", "Single character name"); namespace tesseract { class CommandlineflagsTest : public ::testing::Test { - protected: - void TestParser(int argc, const char** const_argv) { +protected: + void TestParser(int argc, const char **const_argv) { TestParser("", argc, const_argv); } - void TestParser(const char* usage, int argc, const char** const_argv) { + void TestParser(const char *usage, int argc, const char **const_argv) { // Make a 
copy of the pointer since it can be altered by the function. - char** argv = const_cast(const_argv); + char **argv = const_cast(const_argv); tesseract::ParseCommandLineFlags(usage, &argc, &argv, true); } }; TEST_F(CommandlineflagsTest, RemoveFlags) { - const char* const_argv[] = {"Progname", "--foo_int", "3", "file1.h", - "file2.h"}; + const char *const_argv[] = {"Progname", "--foo_int", "3", "file1.h", "file2.h"}; int argc = countof(const_argv); - char** argv = const_cast(const_argv); + char **argv = const_cast(const_argv); tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); // argv should be rearranged to look like { "Progname", "file1.h", "file2.h" } @@ -55,7 +54,7 @@ TEST_F(CommandlineflagsTest, RemoveFlags) { EXPECT_STREQ("file2.h", argv[2]); } -#if 0 // TODO: this test needs an update (it currently fails). +#if 0 // TODO: this test needs an update (it currently fails). TEST_F(CommandlineflagsTest, PrintUsageAndExit) { const char* argv[] = { "Progname", "--help" }; EXPECT_EXIT(TestParser("Progname [flags]", countof(argv), argv), @@ -65,65 +64,62 @@ TEST_F(CommandlineflagsTest, PrintUsageAndExit) { #endif TEST_F(CommandlineflagsTest, ExitsWithErrorOnInvalidFlag) { - const char* argv[] = {"", "--test_nonexistent_flag"}; + const char *argv[] = {"", "--test_nonexistent_flag"}; EXPECT_EXIT(TestParser(countof(argv), argv), ::testing::ExitedWithCode(1), "ERROR: Non-existent flag"); } TEST_F(CommandlineflagsTest, ParseIntegerFlags) { - const char* argv[] = {"", "--foo_int=3", "--bar_int", "-4"}; + const char *argv[] = {"", "--foo_int=3", "--bar_int", "-4"}; TestParser(countof(argv), argv); EXPECT_EQ(3, FLAGS_foo_int); EXPECT_EQ(-4, FLAGS_bar_int); - const char* arg_no_value[] = {"", "--bar_int"}; - EXPECT_EXIT(TestParser(countof(arg_no_value), arg_no_value), - ::testing::ExitedWithCode(1), "ERROR"); + const char *arg_no_value[] = {"", "--bar_int"}; + EXPECT_EXIT(TestParser(countof(arg_no_value), arg_no_value), ::testing::ExitedWithCode(1), + "ERROR"); - 
const char* arg_invalid_value[] = {"", "--bar_int", "--foo_int=3"}; + const char *arg_invalid_value[] = {"", "--bar_int", "--foo_int=3"}; EXPECT_EXIT(TestParser(countof(arg_invalid_value), arg_invalid_value), ::testing::ExitedWithCode(1), "ERROR"); - const char* arg_bad_format[] = {"", "--bar_int="}; - EXPECT_EXIT(TestParser(countof(arg_bad_format), arg_bad_format), - ::testing::ExitedWithCode(1), "ERROR"); + const char *arg_bad_format[] = {"", "--bar_int="}; + EXPECT_EXIT(TestParser(countof(arg_bad_format), arg_bad_format), ::testing::ExitedWithCode(1), + "ERROR"); } TEST_F(CommandlineflagsTest, ParseDoubleFlags) { - const char* argv[] = {"", "--foo_double=3.14", "--bar_double", "1.2"}; + const char *argv[] = {"", "--foo_double=3.14", "--bar_double", "1.2"}; TestParser(countof(argv), argv); EXPECT_EQ(3.14, FLAGS_foo_double); EXPECT_EQ(1.2, FLAGS_bar_double); - const char* arg_no_value[] = {"", "--bar_double"}; - EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), - "ERROR"); + const char *arg_no_value[] = {"", "--bar_double"}; + EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), "ERROR"); - const char* arg_bad_format[] = {"", "--bar_double="}; - EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1), - "ERROR"); + const char *arg_bad_format[] = {"", "--bar_double="}; + EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1), "ERROR"); } TEST_F(CommandlineflagsTest, ParseStringFlags) { - const char* argv[] = {"", "--foo_string=abc", "--bar_string", "def"}; + const char *argv[] = {"", "--foo_string=abc", "--bar_string", "def"}; TestParser(countof(argv), argv); EXPECT_STREQ("abc", FLAGS_foo_string.c_str()); EXPECT_STREQ("def", FLAGS_bar_string.c_str()); - const char* arg_no_value[] = {"", "--bar_string"}; - EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), - "ERROR"); + const char *arg_no_value[] = {"", "--bar_string"}; + EXPECT_EXIT(TestParser(2, arg_no_value), 
::testing::ExitedWithCode(1), "ERROR"); FLAGS_bar_string.set_value("bar"); - const char* arg_empty_string[] = {"", "--bar_string="}; + const char *arg_empty_string[] = {"", "--bar_string="}; TestParser(2, arg_empty_string); EXPECT_STREQ("", FLAGS_bar_string.c_str()); } TEST_F(CommandlineflagsTest, ParseBoolFlags) { - const char* argv[] = {"", "--foo_bool=true", "--bar_bool=1"}; + const char *argv[] = {"", "--foo_bool=true", "--bar_bool=1"}; FLAGS_foo_bool.set_value(false); FLAGS_bar_bool.set_value(false); TestParser(countof(argv), argv); @@ -131,7 +127,7 @@ TEST_F(CommandlineflagsTest, ParseBoolFlags) { EXPECT_TRUE(FLAGS_foo_bool); EXPECT_TRUE(FLAGS_bar_bool); - const char* inv_argv[] = {"", "--foo_bool=false", "--bar_bool=0"}; + const char *inv_argv[] = {"", "--foo_bool=false", "--bar_bool=0"}; FLAGS_foo_bool.set_value(true); FLAGS_bar_bool.set_value(true); TestParser(3, inv_argv); @@ -139,20 +135,19 @@ TEST_F(CommandlineflagsTest, ParseBoolFlags) { EXPECT_FALSE(FLAGS_foo_bool); EXPECT_FALSE(FLAGS_bar_bool); - const char* arg_implied_true[] = {"", "--bar_bool"}; + const char *arg_implied_true[] = {"", "--bar_bool"}; FLAGS_bar_bool.set_value(false); TestParser(2, arg_implied_true); EXPECT_TRUE(FLAGS_bar_bool); - const char* arg_missing_val[] = {"", "--bar_bool="}; - EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1), - "ERROR"); + const char *arg_missing_val[] = {"", "--bar_bool="}; + EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1), "ERROR"); } TEST_F(CommandlineflagsTest, ParseOldFlags) { EXPECT_STREQ("", FLAGS_q.c_str()); - const char* argv[] = {"", "-q", "text"}; + const char *argv[] = {"", "-q", "text"}; TestParser(countof(argv), argv); EXPECT_STREQ("text", FLAGS_q.c_str()); } -} // namespace +} // namespace tesseract diff --git a/unittest/cycletimer.h b/unittest/cycletimer.h index e1a13719..6a61a86b 100644 --- a/unittest/cycletimer.h +++ b/unittest/cycletimer.h @@ -45,17 +45,19 @@ public: running_ = false; } } - 
int64_t GetInMs() const { return GetNanos() / 1000000; } + int64_t GetInMs() const { + return GetNanos() / 1000000; + } - protected: +protected: int64_t GetNanos() const { return running_ ? absl::GetCurrentTimeNanos() - start_ + sum_ : sum_; } - private: +private: bool running_; int64_t start_; int64_t sum_; }; -#endif // TESSERACT_UNITTEST_CYCLETIMER_H +#endif // TESSERACT_UNITTEST_CYCLETIMER_H diff --git a/unittest/dawg_test.cc b/unittest/dawg_test.cc index 4a40b050..1a2989b5 100644 --- a/unittest/dawg_test.cc +++ b/unittest/dawg_test.cc @@ -12,19 +12,19 @@ #include "include_gunit.h" #include "ratngs.h" -#include "unicharset.h" #include "trie.h" +#include "unicharset.h" -#include // for system -#include // for ifstream +#include +#include // for system +#include // for ifstream #include #include #include -#include #ifndef SW_TESTING -#define wordlist2dawg_prog "wordlist2dawg" -#define dawg2wordlist_prog "dawg2wordlist" +# define wordlist2dawg_prog "wordlist2dawg" +# define dawg2wordlist_prog "dawg2wordlist" #endif namespace tesseract { @@ -32,13 +32,13 @@ namespace tesseract { // Test some basic functionality dealing with Dawgs (compressed dictionaries, // aka Directed Acyclic Word Graphs). 
class DawgTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); } - void LoadWordlist(const std::string& filename, std::set* words) const { + void LoadWordlist(const std::string &filename, std::set *words) const { std::ifstream file(filename); if (file.is_open()) { std::string line; @@ -53,34 +53,30 @@ class DawgTest : public testing::Test { file.close(); } } - std::string TessBinaryPath(const std::string& name) const { + std::string TessBinaryPath(const std::string &name) const { return file::JoinPath(TESSBIN_DIR, name); } - std::string OutputNameToPath(const std::string& name) const { + std::string OutputNameToPath(const std::string &name) const { return file::JoinPath(FLAGS_test_tmpdir, name); } - int RunCommand(const std::string& program, const std::string& arg1, - const std::string& arg2, const std::string& arg3) const { - std::string cmdline = - TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3; + int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2, + const std::string &arg3) const { + std::string cmdline = TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3; return system(cmdline.c_str()); } // Test that we are able to convert a wordlist file (one "word" per line) to // a dawg (a compressed format) and then extract the original wordlist back // out using the tools "wordlist2dawg" and "dawg2wordlist." 
- void TestDawgRoundTrip(const std::string& unicharset_filename, - const std::string& wordlist_filename) const { + void TestDawgRoundTrip(const std::string &unicharset_filename, + const std::string &wordlist_filename) const { std::set orig_words, roundtrip_words; std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename); std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename); std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg"); std::string output_wordlist = OutputNameToPath(wordlist_filename); LoadWordlist(orig_wordlist, &orig_words); - EXPECT_EQ( - RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0); - EXPECT_EQ( - RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), - 0); + EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0); + EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0); LoadWordlist(output_wordlist, &roundtrip_words); EXPECT_EQ(orig_words, roundtrip_words); } @@ -93,8 +89,7 @@ TEST_F(DawgTest, TestDawgConversion) { TEST_F(DawgTest, TestMatching) { UNICHARSET unicharset; unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str()); - tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, - unicharset.size(), 0); + tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, unicharset.size(), 0); WERD_CHOICE space_apos(" '", unicharset); trie.add_word_to_dawg(space_apos); @@ -112,4 +107,4 @@ TEST_F(DawgTest, TestMatching) { EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true)); } -} // namespace +} // namespace tesseract diff --git a/unittest/denorm_test.cc b/unittest/denorm_test.cc index 28328b15..d07eab46 100644 --- a/unittest/denorm_test.cc +++ b/unittest/denorm_test.cc @@ -17,16 +17,16 @@ namespace tesseract { class DENORMTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } - 
public: +public: void TearDown() {} - void ExpectCorrectTransform(const DENORM& denorm, const TPOINT& src, - const TPOINT& result, bool local) { + void ExpectCorrectTransform(const DENORM &denorm, const TPOINT &src, const TPOINT &result, + bool local) { TPOINT normed; if (local) denorm.LocalNormTransform(src, &normed); @@ -48,8 +48,8 @@ class DENORMTest : public testing::Test { // Tests a simple baseline-style normalization. TEST_F(DENORMTest, NoRotations) { DENORM denorm; - denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, - 0.0f, static_cast(kBlnBaselineOffset)); + denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f, + static_cast(kBlnBaselineOffset)); TPOINT pt1(1100, 2000); TPOINT result1(200, kBlnBaselineOffset); ExpectCorrectTransform(denorm, pt1, result1, true); @@ -64,8 +64,8 @@ TEST_F(DENORMTest, NoRotations) { TEST_F(DENORMTest, WithRotations) { DENORM denorm; FCOORD rotation90(0.0f, 1.0f); - denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f, - 3.0f, 0.0f, static_cast(kBlnBaselineOffset)); + denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f, + static_cast(kBlnBaselineOffset)); TPOINT pt1(1100, 2000); TPOINT result1(0, 200 + kBlnBaselineOffset); @@ -80,13 +80,13 @@ TEST_F(DENORMTest, WithRotations) { // Tests a simple baseline-style normalization with a second rotation & scale. 
TEST_F(DENORMTest, Multiple) { DENORM denorm; - denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, - 0.0f, static_cast(kBlnBaselineOffset)); + denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f, + static_cast(kBlnBaselineOffset)); DENORM denorm2; FCOORD rotation90(0.0f, 1.0f); - denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f, - 0.25f, 0.0f, 0.0f); + denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f, 0.25f, 0.0f, + 0.0f); TPOINT pt1(1050, 2000); TPOINT result1(100, kBlnBaselineOffset); ExpectCorrectTransform(denorm, pt1, result1, true); @@ -96,4 +96,4 @@ TEST_F(DENORMTest, Multiple) { ExpectCorrectTransform(denorm2, pt1, result2, false); } -} // namespace. +} // namespace tesseract diff --git a/unittest/doubleptr.h b/unittest/doubleptr.h index 38628b5f..dad275de 100644 --- a/unittest/doubleptr.h +++ b/unittest/doubleptr.h @@ -39,32 +39,32 @@ namespace tesseract { // can correctly maintain the pointer to an element of the heap despite it // getting moved around on the heap. class DoublePtr { - public: +public: DoublePtr() : other_end_(nullptr) {} // Copy constructor steals the partner off src and is therefore a non // const reference arg. // Copying a const DoublePtr generates a compiler error. - DoublePtr(const DoublePtr& src) { + DoublePtr(const DoublePtr &src) { other_end_ = src.other_end_; if (other_end_ != nullptr) { other_end_->other_end_ = this; - ((DoublePtr&)src).other_end_ = nullptr; + ((DoublePtr &)src).other_end_ = nullptr; } } // Operator= steals the partner off src, and therefore needs src to be a non- // const reference. // Assigning from a const DoublePtr generates a compiler error. 
- void operator=(const DoublePtr& src) { + void operator=(const DoublePtr &src) { Disconnect(); other_end_ = src.other_end_; if (other_end_ != nullptr) { other_end_->other_end_ = this; - ((DoublePtr&)src).other_end_ = nullptr; + ((DoublePtr &)src).other_end_ = nullptr; } } // Connects this and other, discarding any existing connections. - void Connect(DoublePtr* other) { + void Connect(DoublePtr *other) { other->Disconnect(); Disconnect(); other->other_end_ = this; @@ -78,16 +78,16 @@ class DoublePtr { } } // Returns the pointer to the other end of the double pointer. - DoublePtr* OtherEnd() const { + DoublePtr *OtherEnd() const { return other_end_; } - private: +private: // Pointer to the other end of the link. It is always true that either // other_end_ == nullptr or other_end_->other_end_ == this. - DoublePtr* other_end_; + DoublePtr *other_end_; }; -} // namespace tesseract. +} // namespace tesseract. -#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_ +#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_ diff --git a/unittest/equationdetect_test.cc b/unittest/equationdetect_test.cc index de82ab7d..255915cb 100644 --- a/unittest/equationdetect_test.cc +++ b/unittest/equationdetect_test.cc @@ -23,91 +23,88 @@ #define ENABLE_IdentifySpecialText_TEST 0 #if ENABLE_IdentifySpecialText_TEST -#define EQU_TRAINEDDATA_NAME "equ" +# define EQU_TRAINEDDATA_NAME "equ" #else -#define EQU_TRAINEDDATA_NAME "equINTENTIONALLY_MISSING_FILE" +# define EQU_TRAINEDDATA_NAME "equINTENTIONALLY_MISSING_FILE" #endif namespace tesseract { class TestableEquationDetect : public EquationDetect { - public: - TestableEquationDetect(const char* tessdata, Tesseract* lang_tesseract) +public: + TestableEquationDetect(const char *tessdata, Tesseract *lang_tesseract) : EquationDetect(tessdata, EQU_TRAINEDDATA_NAME) { SetLangTesseract(lang_tesseract); } // Insert a certain math and digit blobs into part. 
- void AddMathDigitBlobs(const int math_blobs, const int digit_blobs, - const int total_blobs, ColPartition* part) { + void AddMathDigitBlobs(const int math_blobs, const int digit_blobs, const int total_blobs, + ColPartition *part) { CHECK(part != nullptr); CHECK_LE(math_blobs + digit_blobs, total_blobs); int count = 0; for (int i = 0; i < math_blobs; i++, count++) { - BLOBNBOX* blob = new BLOBNBOX(); + BLOBNBOX *blob = new BLOBNBOX(); blob->set_special_text_type(BSTT_MATH); part->AddBox(blob); } for (int i = 0; i < digit_blobs; i++, count++) { - BLOBNBOX* blob = new BLOBNBOX(); + BLOBNBOX *blob = new BLOBNBOX(); blob->set_special_text_type(BSTT_DIGIT); part->AddBox(blob); } for (int i = count; i < total_blobs; i++) { - BLOBNBOX* blob = new BLOBNBOX(); + BLOBNBOX *blob = new BLOBNBOX(); blob->set_special_text_type(BSTT_NONE); part->AddBox(blob); } } // Set up pix_binary for lang_tesseract_. - void SetPixBinary(Pix* pix) { + void SetPixBinary(Pix *pix) { CHECK_EQ(1, pixGetDepth(pix)); *(lang_tesseract_->mutable_pix_binary()) = pix; } - void RunIdentifySpecialText(BLOBNBOX* blob, const int height_th) { + void RunIdentifySpecialText(BLOBNBOX *blob, const int height_th) { IdentifySpecialText(blob, height_th); } - BlobSpecialTextType RunEstimateTypeForUnichar(const char* val) { - const UNICHARSET& unicharset = lang_tesseract_->unicharset; + BlobSpecialTextType RunEstimateTypeForUnichar(const char *val) { + const UNICHARSET &unicharset = lang_tesseract_->unicharset; return EstimateTypeForUnichar(unicharset, unicharset.unichar_to_id(val)); } - EquationDetect::IndentType RunIsIndented(ColPartitionGrid* part_grid, - ColPartition* part) { + EquationDetect::IndentType RunIsIndented(ColPartitionGrid *part_grid, ColPartition *part) { this->part_grid_ = part_grid; return IsIndented(part); } - bool RunIsNearSmallNeighbor(const TBOX& seed_box, const TBOX& part_box) { + bool RunIsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) { return IsNearSmallNeighbor(seed_box, 
part_box); } - bool RunCheckSeedBlobsCount(ColPartition* part) { + bool RunCheckSeedBlobsCount(ColPartition *part) { return CheckSeedBlobsCount(part); } - float RunComputeForegroundDensity(const TBOX& tbox) { + float RunComputeForegroundDensity(const TBOX &tbox) { return ComputeForegroundDensity(tbox); } - int RunCountAlignment(const GenericVector& sorted_vec, const int val) { + int RunCountAlignment(const GenericVector &sorted_vec, const int val) { return CountAlignment(sorted_vec, val); } - void RunSplitCPHorLite(ColPartition* part, - GenericVector* splitted_boxes) { + void RunSplitCPHorLite(ColPartition *part, GenericVector *splitted_boxes) { SplitCPHorLite(part, splitted_boxes); } - void RunSplitCPHor(ColPartition* part, - GenericVector* parts_splitted) { + void RunSplitCPHor(ColPartition *part, GenericVector *parts_splitted) { SplitCPHor(part, parts_splitted); } - void TestComputeCPsSuperBBox(const TBOX& box, ColPartitionGrid* part_grid) { + void TestComputeCPsSuperBBox(const TBOX &box, ColPartitionGrid *part_grid) { CHECK(part_grid != nullptr); part_grid_ = part_grid; ComputeCPsSuperBBox(); @@ -116,7 +113,7 @@ class TestableEquationDetect : public EquationDetect { }; class EquationFinderTest : public testing::Test { - protected: +protected: std::unique_ptr equation_det_; std::unique_ptr tesseract_; @@ -128,8 +125,7 @@ class EquationFinderTest : public testing::Test { tesseract_.reset(new Tesseract()); tesseract_->init_tesseract(TESSDATA_DIR, "eng", OEM_TESSERACT_ONLY); tesseract_->set_source_resolution(300); - equation_det_.reset( - new TestableEquationDetect(TESSDATA_DIR, tesseract_.get())); + equation_det_.reset(new TestableEquationDetect(TESSDATA_DIR, tesseract_.get())); equation_det_->SetResolution(300); testdata_dir_ = TESTDATA_DIR; @@ -141,34 +137,31 @@ class EquationFinderTest : public testing::Test { } // Add a BLOCK covering the whole page. 
- void AddPageBlock(Pix* pix, BLOCK_LIST* blocks) { + void AddPageBlock(Pix *pix, BLOCK_LIST *blocks) { CHECK(pix != nullptr); CHECK(blocks != nullptr); BLOCK_IT block_it(blocks); - BLOCK* block = - new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix)); + BLOCK *block = new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix)); block_it.add_to_end(block); } // Create col partitions, add into part_grid, and put them into all_parts. - void CreateColParts(const int rows, const int cols, - ColPartitionGrid* part_grid, - std::vector* all_parts) { + void CreateColParts(const int rows, const int cols, ColPartitionGrid *part_grid, + std::vector *all_parts) { const int kWidth = 10, kHeight = 10; ClearParts(all_parts); for (int y = 0; y < rows; ++y) { for (int x = 0; x < cols; ++x) { int left = x * kWidth * 2, bottom = y * kHeight * 2; TBOX box(left, bottom, left + kWidth, bottom + kHeight); - ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, - BRT_TEXT, BTFT_NONE); + ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part_grid->InsertBBox(true, true, part); all_parts->push_back(part); } } } - void ClearParts(std::vector* all_parts) { + void ClearParts(std::vector *all_parts) { for (size_t i = 0; i < all_parts->size(); ++i) { (*all_parts)[i]->DeleteBoxes(); delete ((*all_parts)[i]); @@ -176,9 +169,9 @@ class EquationFinderTest : public testing::Test { } // Create a BLOBNBOX object with bounding box tbox, and add it into part. - void AddBlobIntoPart(const TBOX& tbox, ColPartition* part) { + void AddBlobIntoPart(const TBOX &tbox, ColPartition *part) { CHECK(part != nullptr); - BLOBNBOX* blob = new BLOBNBOX(); + BLOBNBOX *blob = new BLOBNBOX(); blob->set_bounding_box(tbox); part->AddBox(blob); } @@ -190,25 +183,24 @@ TEST_F(EquationFinderTest, IdentifySpecialText) { #else // TODO: missing equ_gt1.tif // Load Image. 
std::string imagefile = file::JoinPath(testdata_dir_, "equ_gt1.tif"); - Pix* pix_binary = pixRead(imagefile.c_str()); + Pix *pix_binary = pixRead(imagefile.c_str()); CHECK(pix_binary != nullptr && pixGetDepth(pix_binary) == 1); // Get components. BLOCK_LIST blocks; TO_BLOCK_LIST to_blocks; AddPageBlock(pix_binary, &blocks); - Textord* textord = tesseract_->mutable_textord(); + Textord *textord = tesseract_->mutable_textord(); textord->find_components(pix_binary, &blocks, &to_blocks); // Identify special texts from to_blocks. TO_BLOCK_IT to_block_it(&to_blocks); std::map stt_count; - for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); - to_block_it.forward()) { - TO_BLOCK* to_block = to_block_it.data(); + for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); to_block_it.forward()) { + TO_BLOCK *to_block = to_block_it.data(); BLOBNBOX_IT blob_it(&(to_block->blobs)); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { - BLOBNBOX* blob = blob_it.data(); + BLOBNBOX *blob = blob_it.data(); // blob->set_special_text_type(BSTT_NONE); equation_det_->RunIdentifySpecialText(blob, 0); tensorflow::gtl::InsertIfNotPresent(&stt_count, blob->special_text_type(), 0); @@ -266,42 +258,32 @@ TEST_F(EquationFinderTest, IsIndented) { // // part 5: ******** TBOX box1(0, 950, 999, 999); - ColPartition* part1 = - ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part1 = ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part_grid.InsertBBox(true, true, part1); TBOX box2(300, 920, 900, 940); - ColPartition* part2 = - ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part2 = ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part_grid.InsertBBox(true, true, part2); TBOX box3(0, 900, 600, 910); - ColPartition* part3 = - ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part3 = 
ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part_grid.InsertBBox(true, true, part3); TBOX box4(300, 890, 600, 899); - ColPartition* part4 = - ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part4 = ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part_grid.InsertBBox(true, true, part4); TBOX box5(300, 500, 900, 510); - ColPartition* part5 = - ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part5 = ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part_grid.InsertBBox(true, true, part5); // Test // part1 should be no indent. - EXPECT_EQ(EquationDetect::NO_INDENT, - equation_det_->RunIsIndented(&part_grid, part1)); + EXPECT_EQ(EquationDetect::NO_INDENT, equation_det_->RunIsIndented(&part_grid, part1)); // part2 should be left indent in terms of part1. - EXPECT_EQ(EquationDetect::LEFT_INDENT, - equation_det_->RunIsIndented(&part_grid, part2)); + EXPECT_EQ(EquationDetect::LEFT_INDENT, equation_det_->RunIsIndented(&part_grid, part2)); // part3 should be right indent. - EXPECT_EQ(EquationDetect::RIGHT_INDENT, - equation_det_->RunIsIndented(&part_grid, part3)); + EXPECT_EQ(EquationDetect::RIGHT_INDENT, equation_det_->RunIsIndented(&part_grid, part3)); // part4 should be both indented. - EXPECT_EQ(EquationDetect::BOTH_INDENT, - equation_det_->RunIsIndented(&part_grid, part4)); + EXPECT_EQ(EquationDetect::BOTH_INDENT, equation_det_->RunIsIndented(&part_grid, part4)); // part5 should be no indent because it is too far from part1. - EXPECT_EQ(EquationDetect::NO_INDENT, - equation_det_->RunIsIndented(&part_grid, part5)); + EXPECT_EQ(EquationDetect::NO_INDENT, equation_det_->RunIsIndented(&part_grid, part5)); // Release memory. 
part1->DeleteBoxes(); @@ -347,14 +329,10 @@ TEST_F(EquationFinderTest, IsNearSmallNeighbor) { TEST_F(EquationFinderTest, CheckSeedBlobsCount) { TBOX box(0, 950, 999, 999); - ColPartition* part1 = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); - ColPartition* part2 = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); - ColPartition* part3 = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); - ColPartition* part4 = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part1 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part2 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part3 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part4 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); // Part 1: 8 math, 0 digit, 20 total. equation_det_->AddMathDigitBlobs(8, 0, 20, part1); @@ -386,7 +364,7 @@ TEST_F(EquationFinderTest, CheckSeedBlobsCount) { TEST_F(EquationFinderTest, ComputeForegroundDensity) { // Create the pix with top half foreground, bottom half background. 
int width = 1024, height = 768; - Pix* pix = pixCreate(width, height, 1); + Pix *pix = pixCreate(width, height, 1); pixRasterop(pix, 0, 0, width, height / 2, PIX_SET, nullptr, 0, 0); TBOX box1(100, 0, 140, 140), box2(100, height / 2 - 20, 140, height / 2 + 20), box3(100, height - 40, 140, height); @@ -424,25 +402,20 @@ TEST_F(EquationFinderTest, CountAlignment) { } TEST_F(EquationFinderTest, ComputeCPsSuperBBox) { - Pix* pix = pixCreate(1001, 1001, 1); + Pix *pix = pixCreate(1001, 1001, 1); equation_det_->SetPixBinary(pix); ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000)); TBOX box1(0, 0, 999, 99); - ColPartition* part1 = - ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part1 = ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); TBOX box2(0, 100, 499, 199); - ColPartition* part2 = - ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part2 = ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); TBOX box3(500, 100, 999, 199); - ColPartition* part3 = - ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part3 = ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); TBOX box4(0, 200, 999, 299); - ColPartition* part4 = - ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part4 = ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); TBOX box5(0, 900, 999, 999); - ColPartition* part5 = - ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part5 = ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); // Add part1->part3 into part_grid and test. 
part_grid.InsertBBox(true, true, part1); @@ -476,8 +449,7 @@ TEST_F(EquationFinderTest, ComputeCPsSuperBBox) { TEST_F(EquationFinderTest, SplitCPHorLite) { TBOX box(0, 0, 999, 99); - ColPartition* part = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part->DeleteBoxes(); part->set_median_width(10); GenericVector splitted_boxes; @@ -494,10 +466,10 @@ TEST_F(EquationFinderTest, SplitCPHorLite) { // Add more blob and test. AddBlobIntoPart(TBOX(11, 0, 20, 60), part); - AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. + AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. AddBlobIntoPart(TBOX(100, 0, 110, 15), part); - AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. - AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. + AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. + AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. equation_det_->RunSplitCPHorLite(part, &splitted_boxes); // Verify. EXPECT_EQ(3, splitted_boxes.size()); @@ -511,11 +483,10 @@ TEST_F(EquationFinderTest, SplitCPHorLite) { TEST_F(EquationFinderTest, SplitCPHor) { TBOX box(0, 0, 999, 99); - ColPartition* part = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part->DeleteBoxes(); part->set_median_width(10); - GenericVector parts_splitted; + GenericVector parts_splitted; // Test an empty part. equation_det_->RunSplitCPHor(part, &parts_splitted); @@ -529,10 +500,10 @@ TEST_F(EquationFinderTest, SplitCPHor) { // Add more blob and test. AddBlobIntoPart(TBOX(11, 0, 20, 60), part); - AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. + AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. 
AddBlobIntoPart(TBOX(100, 0, 110, 15), part); - AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. - AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. + AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. + AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. equation_det_->RunSplitCPHor(part, &parts_splitted); // Verify. @@ -546,4 +517,4 @@ TEST_F(EquationFinderTest, SplitCPHor) { delete (part); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/fileio_test.cc b/unittest/fileio_test.cc index 00488918..d8e521d8 100644 --- a/unittest/fileio_test.cc +++ b/unittest/fileio_test.cc @@ -9,7 +9,6 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include #include @@ -29,8 +28,9 @@ TEST(FileTest, JoinPath) { TEST(OutputBufferTest, WriteString) { const int kMaxBufSize = 128; char buffer[kMaxBufSize]; - for (int i = 0; i < kMaxBufSize; ++i) buffer[i] = '\0'; - FILE* fp = tmpfile(); + for (int i = 0; i < kMaxBufSize; ++i) + buffer[i] = '\0'; + FILE *fp = tmpfile(); CHECK(fp != nullptr); std::unique_ptr output(new OutputBuffer(fp)); @@ -49,7 +49,7 @@ TEST(InputBufferTest, Read) { auto s = "Hello\n world!"; strncpy(buffer, s, kMaxBufSize); EXPECT_STREQ(s, buffer); - FILE* fp = tmpfile(); + FILE *fp = tmpfile(); CHECK(fp != nullptr); fwrite(buffer, strlen(s), 1, fp); rewind(fp); @@ -63,4 +63,4 @@ TEST(InputBufferTest, Read) { EXPECT_EQ(" world!", lines[1]); } -} // namespace +} // namespace tesseract diff --git a/unittest/fuzzers/fuzzer-api.cpp b/unittest/fuzzers/fuzzer-api.cpp index a1e4e7c4..41178240 100644 --- a/unittest/fuzzers/fuzzer-api.cpp +++ b/unittest/fuzzers/fuzzer-api.cpp @@ -1,28 +1,27 @@ -#include #include +#include -#include // for dirname -#include // for printf -#include // for std::getenv, std::setenv -#include // for std::string +#include // for dirname +#include // for printf +#include // for std::getenv, std::setenv +#include // for 
std::string #ifndef TESSERACT_FUZZER_WIDTH -#define TESSERACT_FUZZER_WIDTH 100 +# define TESSERACT_FUZZER_WIDTH 100 #endif #ifndef TESSERACT_FUZZER_HEIGHT -#define TESSERACT_FUZZER_HEIGHT 100 +# define TESSERACT_FUZZER_HEIGHT 100 #endif class BitReader { - private: - uint8_t const* data; +private: + uint8_t const *data; size_t size; size_t shift; - public: - BitReader(const uint8_t* data, size_t size) - : data(data), size(size), shift(0) {} +public: + BitReader(const uint8_t *data, size_t size) : data(data), size(size), shift(0) {} int Read(void) { if (size == 0) { @@ -42,9 +41,9 @@ class BitReader { } }; -static tesseract::TessBaseAPI* api = nullptr; +static tesseract::TessBaseAPI *api = nullptr; -extern "C" int LLVMFuzzerInitialize(int* /*pArgc*/, char*** pArgv) { +extern "C" int LLVMFuzzerInitialize(int * /*pArgc*/, char ***pArgv) { if (std::getenv("TESSDATA_PREFIX") == nullptr) { std::string binary_path = *pArgv[0]; const std::string filepath = dirname(&binary_path[0]); @@ -68,8 +67,8 @@ extern "C" int LLVMFuzzerInitialize(int* /*pArgc*/, char*** pArgv) { return 0; } -static PIX* createPix(BitReader& BR, const size_t width, const size_t height) { - Pix* pix = pixCreate(width, height, 1); +static PIX *createPix(BitReader &BR, const size_t width, const size_t height) { + Pix *pix = pixCreate(width, height, 1); if (pix == nullptr) { printf("pix creation failed\n"); @@ -85,14 +84,14 @@ static PIX* createPix(BitReader& BR, const size_t width, const size_t height) { return pix; } -extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { BitReader BR(data, size); auto pix = createPix(BR, TESSERACT_FUZZER_WIDTH, TESSERACT_FUZZER_HEIGHT); api->SetImage(pix); - char* outText = api->GetUTF8Text(); + char *outText = api->GetUTF8Text(); pixDestroy(&pix); delete[] outText; diff --git a/unittest/heap_test.cc b/unittest/heap_test.cc index 53ccbd86..ecae2563 100644 --- 
a/unittest/heap_test.cc +++ b/unittest/heap_test.cc @@ -9,7 +9,6 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include "include_gunit.h" #include "doubleptr.h" @@ -26,15 +25,15 @@ int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0}; // The fixture for testing GenericHeap and DoublePtr. class HeapTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } - public: +public: virtual ~HeapTest(); // Pushes the test data onto both the heap and the KDVector. - void PushTestData(GenericHeap* heap, KDVector* v) { + void PushTestData(GenericHeap *heap, KDVector *v) { for (size_t i = 0; i < countof(test_data); ++i) { IntKDPair pair(test_data[i], i); heap->Push(&pair); @@ -43,7 +42,7 @@ class HeapTest : public testing::Test { } // Verifies that the data in the heap matches the vector (after sorting) by // popping everything off the heap. - void VerifyHeapVectorMatch(GenericHeap* heap, KDVector* v) { + void VerifyHeapVectorMatch(GenericHeap *heap, KDVector *v) { EXPECT_FALSE(heap->empty()); EXPECT_EQ(heap->size(), v->size()); // Sort the vector and check that the keys come out of the heap in the same @@ -153,12 +152,12 @@ TEST_F(HeapTest, RevalueTest) { // heap entry, wherever it may be. We can change its value via that pointer. // Without Reshuffle, that would be a terribly bad thing to do, as it violates // the heap invariant, making the heap corrupt. - PtrPair* pair_ptr = reinterpret_cast(v[0].data().OtherEnd()); + PtrPair *pair_ptr = reinterpret_cast(v[0].data().OtherEnd()); pair_ptr->key() = v[0].key(); heap.Reshuffle(pair_ptr); // Index 1 is 1. Change to 32767. 
v[1].key() = 32767; - pair_ptr = reinterpret_cast(v[1].data().OtherEnd()); + pair_ptr = reinterpret_cast(v[1].data().OtherEnd()); pair_ptr->key() = v[1].key(); heap.Reshuffle(pair_ptr); // After the changes, popping the heap should still match the sorted order @@ -199,4 +198,4 @@ TEST_F(HeapTest, DoublePtrTest) { EXPECT_TRUE(ptr3.OtherEnd() == nullptr); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/imagedata_test.cc b/unittest/imagedata_test.cc index 31bd2f24..272884c1 100644 --- a/unittest/imagedata_test.cc +++ b/unittest/imagedata_test.cc @@ -24,7 +24,7 @@ namespace tesseract { // Tests the caching mechanism of DocumentData/ImageData. class ImagedataTest : public ::testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); @@ -33,8 +33,7 @@ class ImagedataTest : public ::testing::Test { ImagedataTest() {} // Creates a fake DocumentData, writes it to a file, and returns the filename. - std::string MakeFakeDoc(int num_pages, unsigned doc_id, - std::vector* page_texts) { + std::string MakeFakeDoc(int num_pages, unsigned doc_id, std::vector *page_texts) { // The size of the fake images that we will use. const int kImageSize = 1048576; // Not using a real image here - just an array of zeros! We are just testing @@ -43,18 +42,16 @@ class ImagedataTest : public ::testing::Test { DocumentData write_doc("My document"); for (int p = 0; p < num_pages; ++p) { // Make some fake text that is different for each page and save it. - page_texts->push_back( - absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id)); + page_texts->push_back(absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id)); // Make an imagedata and put it in the document. 
- ImageData* imagedata = - ImageData::Build("noname", p, "eng", fake_image.data(), - fake_image.size(), (*page_texts)[p].c_str(), nullptr); + ImageData *imagedata = ImageData::Build("noname", p, "eng", fake_image.data(), + fake_image.size(), (*page_texts)[p].c_str(), nullptr); EXPECT_EQ(kImageSize, imagedata->MemoryUsed()); write_doc.AddPageToDocument(imagedata); } // Write it to a file. - std::string filename = file::JoinPath( - FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf")); + std::string filename = + file::JoinPath(FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf")); EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr)); return filename; } @@ -76,18 +73,16 @@ TEST_F(ImagedataTest, CachesProperly) { // the pages can still be read. for (int m = 0; kMemoryAllowances[m] > 0; ++m) { DocumentData read_doc("My document"); - EXPECT_TRUE( - read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr)); + EXPECT_TRUE(read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr)); LOG(ERROR) << "Allowance = " << kMemoryAllowances[m]; // Read the pages in a specific order. for (int p = 0; kPageReadOrder[p] >= 0; ++p) { int page = kPageReadOrder[p]; - const ImageData* imagedata = read_doc.GetPage(page); + const ImageData *imagedata = read_doc.GetPage(page); EXPECT_NE(nullptr, imagedata); - //EXPECT_NE(reinterpret_cast(nullptr), imagedata); + // EXPECT_NE(reinterpret_cast(nullptr), imagedata); // Check that this is the right page. 
- EXPECT_STREQ(page_texts[page].c_str(), - imagedata->transcription().c_str()); + EXPECT_STREQ(page_texts[page].c_str(), imagedata->transcription().c_str()); } } } @@ -112,20 +107,18 @@ TEST_F(ImagedataTest, CachesMultiDocs) { serial_cache.LoadDocuments(filenames, tesseract::CS_SEQUENTIAL, nullptr); for (int p = 0; p <= 21; ++p) { LOG(INFO) << "Page " << p; - const ImageData* robin_data = robin_cache.GetPageBySerial(p); - const ImageData* serial_data = serial_cache.GetPageBySerial(p); + const ImageData *robin_data = robin_cache.GetPageBySerial(p); + const ImageData *serial_data = serial_cache.GetPageBySerial(p); CHECK(robin_data != nullptr); CHECK(serial_data != nullptr); int robin_doc = p % kNumPages.size(); int robin_page = p / kNumPages.size() % kNumPages[robin_doc]; // Check that this is the right page. - EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(), - robin_data->transcription().c_str()); + EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(), robin_data->transcription().c_str()); int serial_doc = p / kNumPages[0] % kNumPages.size(); int serial_page = p % kNumPages[0] % kNumPages[serial_doc]; - EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(), - serial_data->transcription().c_str()); + EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(), serial_data->transcription().c_str()); } } -} // namespace. 
+} // namespace tesseract diff --git a/unittest/include_gunit.h b/unittest/include_gunit.h index ec508e7f..47914a04 100644 --- a/unittest/include_gunit.h +++ b/unittest/include_gunit.h @@ -13,16 +13,15 @@ #ifndef TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ #define TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ -#include "errcode.h" // for ASSERT_HOST -#include "fileio.h" // for tesseract::File -#include "log.h" // for LOG +#include "errcode.h" // for ASSERT_HOST +#include "fileio.h" // for tesseract::File #include "gtest/gtest.h" +#include "log.h" // for LOG -const char* FLAGS_test_tmpdir = "./tmp"; +const char *FLAGS_test_tmpdir = "./tmp"; class file : public tesseract::File { public: - static void MakeTmpdir() { #if defined(_WIN32) _mkdir(FLAGS_test_tmpdir); @@ -31,17 +30,18 @@ public: #endif } -// Create a file and write a string to it. - static bool WriteStringToFile(const std::string& contents, const std::string& filename) { + // Create a file and write a string to it. + static bool WriteStringToFile(const std::string &contents, const std::string &filename) { File::WriteStringToFileOrDie(contents, filename); return true; } - static bool GetContents(const std::string& filename, std::string* out, int) { + static bool GetContents(const std::string &filename, std::string *out, int) { return File::ReadFileToString(filename, out); } - static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) { + static bool SetContents(const std::string &name, const std::string &contents, + bool /*is_default*/) { return WriteStringToFile(contents, name); } @@ -49,26 +49,25 @@ public: return 0; } - static std::string JoinPath(const std::string& s1, const std::string& s2) { + static std::string JoinPath(const std::string &s1, const std::string &s2) { return tesseract::File::JoinPath(s1, s2); } - static std::string JoinPath(const std::string& s1, const std::string& s2, - const std::string& s3) { + static std::string JoinPath(const std::string &s1, const 
std::string &s2, const std::string &s3) { return JoinPath(JoinPath(s1, s2), s3); } }; // /usr/include/tensorflow/core/platform/default/logging.h defines the CHECK* macros. #if !defined(CHECK) -#define CHECK(condition) \ - if (!(condition)) \ +# define CHECK(condition) \ + if (!(condition)) \ LOG(FATAL) << "Check failed: " #condition " " -#define CHECK_EQ(test, value) CHECK((test) == (value)) -#define CHECK_GT(test, value) CHECK((test) > (value)) -#define CHECK_LT(test, value) CHECK((test) < (value)) -#define CHECK_LE(test, value) CHECK((test) <= (value)) -#define CHECK_OK(test) CHECK(test) +# define CHECK_EQ(test, value) CHECK((test) == (value)) +# define CHECK_GT(test, value) CHECK((test) > (value)) +# define CHECK_LT(test, value) CHECK((test) < (value)) +# define CHECK_LE(test, value) CHECK((test) <= (value)) +# define CHECK_OK(test) CHECK(test) #endif -#endif // TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ +#endif // TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ diff --git a/unittest/indexmapbidi_test.cc b/unittest/indexmapbidi_test.cc index bdd3c895..d4b884af 100644 --- a/unittest/indexmapbidi_test.cc +++ b/unittest/indexmapbidi_test.cc @@ -22,32 +22,34 @@ const int kPrimeLimit = 1000; namespace tesseract { class IndexMapBiDiTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); } - public: - std::string OutputNameToPath(const std::string& name) { +public: + std::string OutputNameToPath(const std::string &name) { return file::JoinPath(FLAGS_test_tmpdir, name); } // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes. - void ComputePrimes(IndexMapBiDi* map) { + void ComputePrimes(IndexMapBiDi *map) { map->Init(kPrimeLimit + 1, false); map->SetMap(2, true); // Set all the odds to true. 
- for (int i = 3; i <= kPrimeLimit; i += 2) map->SetMap(i, true); + for (int i = 3; i <= kPrimeLimit; i += 2) + map->SetMap(i, true); int factor_limit = static_cast(sqrt(1.0 + kPrimeLimit)); for (int f = 3; f <= factor_limit; f += 2) { if (map->SparseToCompact(f) >= 0) { - for (int m = 2; m * f <= kPrimeLimit; ++m) map->SetMap(f * m, false); + for (int m = 2; m * f <= kPrimeLimit; ++m) + map->SetMap(f * m, false); } } map->Setup(); } - void TestPrimes(const IndexMap& map) { + void TestPrimes(const IndexMap &map) { // Now all primes are mapped in the sparse map to their index. // According to Wikipedia, the 168th prime is 997, and it has compact // index 167 because we are indexing from 0. @@ -81,7 +83,7 @@ TEST_F(IndexMapBiDiTest, Primes) { TestPrimes(base_map); // Test file i/o too. std::string filename = OutputNameToPath("primesmap"); - FILE* fp = fopen(filename.c_str(), "wb"); + FILE *fp = fopen(filename.c_str(), "wb"); CHECK(fp != nullptr); EXPECT_TRUE(map.Serialize(fp)); fclose(fp); @@ -114,4 +116,4 @@ TEST_F(IndexMapBiDiTest, ManyToOne) { EXPECT_EQ(1, map.SparseToCompact(11)); } -} // namespace. +} // namespace tesseract diff --git a/unittest/intfeaturemap_test.cc b/unittest/intfeaturemap_test.cc index e95aa0c3..e6199012 100644 --- a/unittest/intfeaturemap_test.cc +++ b/unittest/intfeaturemap_test.cc @@ -23,15 +23,15 @@ const int kThetaBuckets = 13; namespace tesseract { class IntFeatureMapTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } - public: +public: // Expects that the given vector has contiguous integer values in the // range [start, end). 
- void ExpectContiguous(const GenericVector& v, int start, int end) { + void ExpectContiguous(const GenericVector &v, int start, int end) { for (int i = start; i < end; ++i) { EXPECT_EQ(i, v[i - start]); } @@ -49,8 +49,7 @@ TEST_F(IntFeatureMapTest, Exhaustive) { IntFeatureMap map; map.Init(space); int total_size = kIntFeatureExtent * kIntFeatureExtent * kIntFeatureExtent; - std::unique_ptr features( - new INT_FEATURE_STRUCT[total_size]); + std::unique_ptr features(new INT_FEATURE_STRUCT[total_size]); // Fill the features with every value. for (int y = 0; y < kIntFeatureExtent; ++y) { for (int x = 0; x < kIntFeatureExtent; ++x) { @@ -80,8 +79,7 @@ TEST_F(IntFeatureMapTest, Exhaustive) { int dtheta = kIntFeatureExtent / kThetaBuckets + 1; int bad_offsets = 0; for (int index = 0; index < total_buckets; ++index) { - for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps; - ++dir) { + for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps; ++dir) { int offset_index = map.OffsetFeature(index, dir); if (dir == 0) { EXPECT_EQ(index, offset_index); @@ -112,11 +110,9 @@ TEST_F(IntFeatureMapTest, Exhaustive) { // Has no effect on index features. EXPECT_EQ(total_size, index_features.size()); misses = map.MapIndexedFeatures(index_features, &map_features); - int expected_misses = (kIntFeatureExtent / kXBuckets) * - (kIntFeatureExtent / kYBuckets) * + int expected_misses = (kIntFeatureExtent / kXBuckets) * (kIntFeatureExtent / kYBuckets) * (kIntFeatureExtent / kThetaBuckets + 1); - expected_misses += (kIntFeatureExtent / kXBuckets) * - (kIntFeatureExtent / kYBuckets + 1) * + expected_misses += (kIntFeatureExtent / kXBuckets) * (kIntFeatureExtent / kYBuckets + 1) * (kIntFeatureExtent / kThetaBuckets); EXPECT_EQ(expected_misses, misses); EXPECT_EQ(total_buckets - 2, map_features.size()); @@ -126,4 +122,4 @@ TEST_F(IntFeatureMapTest, Exhaustive) { #endif } -} // namespace. 
+} // namespace tesseract diff --git a/unittest/intsimdmatrix_test.cc b/unittest/intsimdmatrix_test.cc index cdfbaa2c..49fbd1c1 100644 --- a/unittest/intsimdmatrix_test.cc +++ b/unittest/intsimdmatrix_test.cc @@ -15,10 +15,10 @@ /////////////////////////////////////////////////////////////////////// #include "intsimdmatrix.h" -#include -#include #include #include +#include +#include #include "include_gunit.h" #include "matrix.h" #include "simddetect.h" @@ -27,7 +27,7 @@ namespace tesseract { class IntSimdMatrixTest : public ::testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } @@ -43,7 +43,7 @@ class IntSimdMatrixTest : public ::testing::Test { return a; } // Makes a random input vector of the given size, with rounding up. - std::vector RandomVector(int size, const IntSimdMatrix& matrix) { + std::vector RandomVector(int size, const IntSimdMatrix &matrix) { int rounded_size = matrix.RoundInputs(size); std::vector v(rounded_size, 0); for (int i = 0; i < size; ++i) { @@ -60,7 +60,7 @@ class IntSimdMatrixTest : public ::testing::Test { return v; } // Tests a range of sizes and compares the results against the generic version. 
- void ExpectEqualResults(const IntSimdMatrix& matrix) { + void ExpectEqualResults(const IntSimdMatrix &matrix) { double total = 0.0; for (int num_out = 1; num_out < 130; ++num_out) { for (int num_in = 1; num_in < 130; ++num_in) { @@ -80,8 +80,8 @@ class IntSimdMatrixTest : public ::testing::Test { matrix.Init(w, shaped_wi, rounded_num_out); scales.reserve(rounded_num_out); if (matrix.matrixDotVectorFunction) { - matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0], - &scales[0], &u[0], &test_result[0]); + matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0], &scales[0], &u[0], + &test_result[0]); } else { IntSimdMatrix::MatrixDotVector(w, scales, u.data(), test_result.data()); } @@ -132,4 +132,4 @@ TEST_F(IntSimdMatrixTest, AVX2) { #endif } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/lang_model_test.cc b/unittest/lang_model_test.cc index b059c18c..5b1c3735 100644 --- a/unittest/lang_model_test.cc +++ b/unittest/lang_model_test.cc @@ -9,21 +9,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // for std::string +#include // for std::string #include "absl/strings/str_cat.h" -#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "gmock/gmock.h" // for testing::ElementsAreArray #include "include_gunit.h" #include "lang_model_helpers.h" -#include "log.h" // for LOG +#include "log.h" // for LOG #include "lstmtrainer.h" #include "unicharset_training_utils.h" namespace tesseract { -std::string TestDataNameToPath(const std::string& name) { +std::string TestDataNameToPath(const std::string &name) { return file::JoinPath(TESTING_DIR, name); } @@ -48,22 +48,19 @@ TEST(LangModelTest, AddACharacter) { bool pass_through_recoder = false; std::vector words, puncs, numbers; // If these reads fail, we get a warning message and an empty list of words. 
- ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr) - .split('\n', &words); + ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr).split('\n', &words); EXPECT_GT(words.size(), 0); ReadFile(file::JoinPath(eng_dir, "eng.punc"), nullptr).split('\n', &puncs); EXPECT_GT(puncs.size(), 0); - ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr) - .split('\n', &numbers); + ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr).split('\n', &numbers); EXPECT_GT(numbers.size(), 0); bool lang_is_rtl = false; // Generate the traineddata file. - EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, - lang1, pass_through_recoder, words, puncs, - numbers, lang_is_rtl, nullptr, nullptr)); + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1, + pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr, + nullptr)); // Init a trainer with it, and encode kTestString. - std::string traineddata1 = - file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); + std::string traineddata1 = file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); LSTMTrainer trainer1; trainer1.InitCharSet(traineddata1); std::vector labels1; @@ -75,18 +72,15 @@ TEST(LangModelTest, AddACharacter) { // Add a new character to the unicharset and try again. int size_before = unicharset.size(); unicharset.unichar_insert("₹"); - SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, - &unicharset); + SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset); EXPECT_EQ(size_before + 1, unicharset.size()); // Generate the traineddata file. 
std::string lang2 = "extended"; - EXPECT_EQ(EXIT_SUCCESS, - CombineLangModel(unicharset, script_dir, version_str, output_dir, - lang2, pass_through_recoder, words, puncs, numbers, - lang_is_rtl, nullptr, nullptr)); + EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2, + pass_through_recoder, words, puncs, numbers, lang_is_rtl, + nullptr, nullptr)); // Init a trainer with it, and encode kTestString. - std::string traineddata2 = - file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); + std::string traineddata2 = file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); LSTMTrainer trainer2; trainer2.InitCharSet(traineddata2); std::vector labels2; @@ -114,8 +108,7 @@ TEST(LangModelTest, AddACharacter) { else labels1_v[i] = labels1[i]; } - EXPECT_THAT(labels1_v, - testing::ElementsAreArray(&labels2[0], labels2.size())); + EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size())); // To make sure we we are not cheating somehow, we can now encode the Rupee // symbol, which we could not do before. EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1)); @@ -140,22 +133,19 @@ TEST(LangModelTest, AddACharacterHindi) { bool pass_through_recoder = false; std::vector words, puncs, numbers; // If these reads fail, we get a warning message and an empty list of words. - ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr) - .split('\n', &words); + ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr).split('\n', &words); EXPECT_GT(words.size(), 0); ReadFile(file::JoinPath(hin_dir, "hin.punc"), nullptr).split('\n', &puncs); EXPECT_GT(puncs.size(), 0); - ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr) - .split('\n', &numbers); + ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr).split('\n', &numbers); EXPECT_GT(numbers.size(), 0); bool lang_is_rtl = false; // Generate the traineddata file. 
- EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, - lang1, pass_through_recoder, words, puncs, - numbers, lang_is_rtl, nullptr, nullptr)); + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1, + pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr, + nullptr)); // Init a trainer with it, and encode kTestString. - std::string traineddata1 = - file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); + std::string traineddata1 = file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); LSTMTrainer trainer1; trainer1.InitCharSet(traineddata1); std::vector labels1; @@ -167,18 +157,15 @@ TEST(LangModelTest, AddACharacterHindi) { // Add a new character to the unicharset and try again. int size_before = unicharset.size(); unicharset.unichar_insert("₹"); - SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, - &unicharset); + SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset); EXPECT_EQ(size_before + 1, unicharset.size()); // Generate the traineddata file. std::string lang2 = "extendedhin"; - EXPECT_EQ(EXIT_SUCCESS, - CombineLangModel(unicharset, script_dir, version_str, output_dir, - lang2, pass_through_recoder, words, puncs, numbers, - lang_is_rtl, nullptr, nullptr)); + EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2, + pass_through_recoder, words, puncs, numbers, lang_is_rtl, + nullptr, nullptr)); // Init a trainer with it, and encode kTestString. 
- std::string traineddata2 = - file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); + std::string traineddata2 = file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); LSTMTrainer trainer2; trainer2.InitCharSet(traineddata2); std::vector labels2; @@ -206,12 +193,11 @@ TEST(LangModelTest, AddACharacterHindi) { else labels1_v[i] = labels1[i]; } - EXPECT_THAT(labels1_v, - testing::ElementsAreArray(&labels2[0], labels2.size())); + EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size())); // To make sure we we are not cheating somehow, we can now encode the Rupee // symbol, which we could not do before. EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1)); EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2)); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/layout_test.cc b/unittest/layout_test.cc index 52892ee4..788aeb9c 100644 --- a/unittest/layout_test.cc +++ b/unittest/layout_test.cc @@ -16,57 +16,59 @@ #include #include +#include #include "coutln.h" -#include "log.h" // for LOG +#include "log.h" // for LOG #include "mutableiterator.h" -#include "ocrblock.h" // for class BLOCK +#include "ocrblock.h" // for class BLOCK #include "pageres.h" #include "polyblk.h" -#include #include "stepblob.h" namespace tesseract { /** String name for each block type. Keep in sync with PolyBlockType. */ -static const char* kPolyBlockNames[] = { - "Unknown", - "Flowing Text", - "Heading Text", - "Pullout Text", - "Equation", - "Inline Equation", - "Table", - "Vertical Text", - "Caption Text", - "Flowing Image", - "Heading Image", - "Pullout Image", - "Horizontal Line", - "Vertical Line", - "Noise", - "" // End marker for testing that sizes match. 
+static const char *kPolyBlockNames[] = { + "Unknown", + "Flowing Text", + "Heading Text", + "Pullout Text", + "Equation", + "Inline Equation", + "Table", + "Vertical Text", + "Caption Text", + "Flowing Image", + "Heading Image", + "Pullout Image", + "Horizontal Line", + "Vertical Line", + "Noise", + "" // End marker for testing that sizes match. }; -const char* kStrings8087_054[] = { - "dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr}; -const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT, - PT_PULLOUT_IMAGE, PT_CAPTION_TEXT, - PT_FLOWING_TEXT}; +const char *kStrings8087_054[] = {"dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr}; +const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT, PT_PULLOUT_IMAGE, + PT_CAPTION_TEXT, PT_FLOWING_TEXT}; // The fixture for testing Tesseract. class LayoutTest : public testing::Test { - protected: - std::string TestDataNameToPath(const std::string& name) { +protected: + std::string TestDataNameToPath(const std::string &name) { return file::JoinPath(TESTING_DIR, "/" + name); } std::string TessdataPath() { return file::JoinPath(TESSDATA_DIR, ""); } - LayoutTest() { src_pix_ = nullptr; } - ~LayoutTest() { pixDestroy(&src_pix_); } + LayoutTest() { + src_pix_ = nullptr; + } + ~LayoutTest() { + pixDestroy(&src_pix_); + } - void SetImage(const char* filename, const char* lang) { + void SetImage(const char *filename, const char *lang) { pixDestroy(&src_pix_); src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); api_.Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY); @@ -79,24 +81,23 @@ class LayoutTest : public testing::Test { // allowing for other blocks in between. // An empty string should match an image block, and a nullptr string // indicates the end of the array. 
- void VerifyBlockTextOrder(const char* strings[], const PolyBlockType* blocks, - ResultIterator* it) { + void VerifyBlockTextOrder(const char *strings[], const PolyBlockType *blocks, + ResultIterator *it) { it->Begin(); int string_index = 0; int block_index = 0; do { - char* block_text = it->GetUTF8Text(tesseract::RIL_BLOCK); + char *block_text = it->GetUTF8Text(tesseract::RIL_BLOCK); if (block_text != nullptr && it->BlockType() == blocks[string_index] && strstr(block_text, strings[string_index]) != nullptr) { - LOG(INFO) << "Found string " << strings[string_index] - << " in block " << block_index - << " of type " << kPolyBlockNames[blocks[string_index]] << "\n"; + LOG(INFO) << "Found string " << strings[string_index] << " in block " << block_index + << " of type " << kPolyBlockNames[blocks[string_index]] << "\n"; // Found this one. ++string_index; - } else if (it->BlockType() == blocks[string_index] && - block_text == nullptr && strings[string_index][0] == '\0') { - LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]] - << " at block " << block_index << "\n"; + } else if (it->BlockType() == blocks[string_index] && block_text == nullptr && + strings[string_index][0] == '\0') { + LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]] << " at block " + << block_index << "\n"; // Found this one. ++string_index; } else { @@ -104,7 +105,8 @@ class LayoutTest : public testing::Test { } delete[] block_text; ++block_index; - if (strings[string_index] == nullptr) break; + if (strings[string_index] == nullptr) + break; } while (it->Next(tesseract::RIL_BLOCK)); EXPECT_TRUE(strings[string_index] == nullptr); } @@ -114,7 +116,7 @@ class LayoutTest : public testing::Test { // If a block overlaps its predecessor in x, then it must be below it. // otherwise, if the block is not below its predecessor, then it must // be to the left of it if right_to_left is true, or to the right otherwise. 
- void VerifyRoughBlockOrder(bool right_to_left, ResultIterator* it) { + void VerifyRoughBlockOrder(bool right_to_left, ResultIterator *it) { int prev_left = 0; int prev_right = 0; int prev_bottom = 0; @@ -122,8 +124,7 @@ class LayoutTest : public testing::Test { do { int left, top, right, bottom; if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) && - PTIsTextType(it->BlockType()) && right - left > 800 && - bottom - top > 200) { + PTIsTextType(it->BlockType()) && right - left > 800 && bottom - top > 200) { if (prev_right > prev_left) { if (std::min(right, prev_right) > std::max(left, prev_left)) { EXPECT_GE(top, prev_bottom) << "Overlapping block should be below"; @@ -145,15 +146,14 @@ class LayoutTest : public testing::Test { // Tests that every blob assigned to the biggest text blocks is contained // fully within its block by testing that the block polygon winds around // the center of the bounding boxes of the outlines in the blob. - void VerifyTotalContainment(int winding_target, MutableIterator* it) { + void VerifyTotalContainment(int winding_target, MutableIterator *it) { it->Begin(); do { int left, top, right, bottom; if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) && - PTIsTextType(it->BlockType()) && right - left > 800 && - bottom - top > 200) { - const PAGE_RES_IT* pr_it = it->PageResIt(); - POLY_BLOCK* pb = pr_it->block()->block->pdblk.poly_block(); + PTIsTextType(it->BlockType()) && right - left > 800 && bottom - top > 200) { + const PAGE_RES_IT *pr_it = it->PageResIt(); + POLY_BLOCK *pb = pr_it->block()->block->pdblk.poly_block(); CHECK(pb != nullptr); FCOORD skew = pr_it->block()->block->skew(); EXPECT_GT(skew.x(), 0.0f); @@ -161,18 +161,17 @@ class LayoutTest : public testing::Test { // Iterate the words in the block. MutableIterator word_it = *it; do { - const PAGE_RES_IT* w_it = word_it.PageResIt(); + const PAGE_RES_IT *w_it = word_it.PageResIt(); // Iterate the blobs in the word. 
C_BLOB_IT b_it(w_it->word()->word->cblob_list()); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { - C_BLOB* blob = b_it.data(); + C_BLOB *blob = b_it.data(); // Iterate the outlines in the blob. C_OUTLINE_IT ol_it(blob->out_list()); for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { - C_OUTLINE* ol = ol_it.data(); + C_OUTLINE *ol = ol_it.data(); TBOX box = ol->bounding_box(); - ICOORD middle((box.left() + box.right()) / 2, - (box.top() + box.bottom()) / 2); + ICOORD middle((box.left() + box.right()) / 2, (box.top() + box.bottom()) / 2); EXPECT_EQ(winding_target, pb->winding_number(middle)); } } @@ -182,7 +181,7 @@ class LayoutTest : public testing::Test { } while (it->Next(tesseract::RIL_BLOCK)); } - Pix* src_pix_; + Pix *src_pix_; std::string ocr_text_; tesseract::TessBaseAPI api_; }; @@ -202,7 +201,7 @@ TEST_F(LayoutTest, UNLV8087_054) { // Just run recognition. EXPECT_EQ(api_.Recognize(nullptr), 0); // Check iterator position. - tesseract::ResultIterator* it = api_.GetIterator(); + tesseract::ResultIterator *it = api_.GetIterator(); VerifyBlockTextOrder(kStrings8087_054, kBlocks8087_054, it); delete it; } @@ -214,7 +213,7 @@ TEST_F(LayoutTest, HebrewOrderingAndSkew) { SetImage("hebrew.png", "eng"); // Just run recognition. EXPECT_EQ(api_.Recognize(nullptr), 0); - tesseract::MutableIterator* it = api_.GetMutableIterator(); + tesseract::MutableIterator *it = api_.GetMutableIterator(); // In eng mode, block order should not be RTL. VerifyRoughBlockOrder(false, it); VerifyTotalContainment(1, it); @@ -231,4 +230,4 @@ TEST_F(LayoutTest, HebrewOrderingAndSkew) { delete it; } -} // namespace +} // namespace tesseract diff --git a/unittest/ligature_table_test.cc b/unittest/ligature_table_test.cc index b2a0bcdf..44ada686 100644 --- a/unittest/ligature_table_test.cc +++ b/unittest/ligature_table_test.cc @@ -9,10 +9,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "ligature_table.h" #include "commandlineflags.h" #include "fileio.h" #include "include_gunit.h" -#include "ligature_table.h" #include "pango_font_info.h" namespace tesseract { @@ -25,11 +25,11 @@ const char kEngLigatureText[] = "fidelity effigy ſteep"; // ligature. The test Verdana font does not support the "ffi" or "ſt" ligature. const char kRenderableEngLigatureText[] = "fidelity effigy ſteep"; -static PangoFontMap* font_map; +static PangoFontMap *font_map; class LigatureTableTest : public ::testing::Test { - protected: - void SetUp() override { +protected: + void SetUp() override { lig_table_ = LigatureTable::Get(); if (!font_map) { font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); @@ -46,7 +46,7 @@ class LigatureTableTest : public ::testing::Test { file::MakeTmpdir(); PangoFontInfo::SoftInitFontConfig(); // init early } - LigatureTable* lig_table_; + LigatureTable *lig_table_; }; TEST_F(LigatureTableTest, DoesFillLigatureTables) { @@ -55,15 +55,14 @@ TEST_F(LigatureTableTest, DoesFillLigatureTables) { } TEST_F(LigatureTableTest, DoesAddLigatures) { - EXPECT_STREQ(kEngLigatureText, - lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str()); + EXPECT_STREQ(kEngLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str()); } TEST_F(LigatureTableTest, DoesAddLigaturesWithSupportedFont) { PangoFontInfo font; EXPECT_TRUE(font.ParseFontDescriptionName("Verdana")); -printf("1:%s\n", kRenderableEngLigatureText); -printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str()); + printf("1:%s\n", kRenderableEngLigatureText); + printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str()); EXPECT_STREQ(kRenderableEngLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str()); } @@ -71,41 +70,34 @@ printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str()); TEST_F(LigatureTableTest, DoesNotAddLigaturesWithUnsupportedFont) { PangoFontInfo font; 
EXPECT_TRUE(font.ParseFontDescriptionName("Lohit Hindi")); - EXPECT_STREQ(kEngNonLigatureText, - lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str()); + EXPECT_STREQ(kEngNonLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str()); } TEST_F(LigatureTableTest, DoesRemoveLigatures) { - EXPECT_STREQ(kEngNonLigatureText, - lig_table_->RemoveLigatures(kEngLigatureText).c_str()); + EXPECT_STREQ(kEngNonLigatureText, lig_table_->RemoveLigatures(kEngLigatureText).c_str()); } TEST_F(LigatureTableTest, TestCustomLigatures) { - const char* kTestCases[] = { + const char *kTestCases[] = { "act", "a\uE003", "publiſh", "publi\uE006", "ſince", "\uE007nce", "aſleep", "a\uE008eep", "neceſſary", "nece\uE009ary", }; for (size_t i = 0; i < countof(kTestCases); i += 2) { - EXPECT_STREQ(kTestCases[i + 1], - lig_table_->AddLigatures(kTestCases[i], nullptr).c_str()); - EXPECT_STREQ(kTestCases[i], - lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str()); - EXPECT_STREQ(kTestCases[i], - lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str()); + EXPECT_STREQ(kTestCases[i + 1], lig_table_->AddLigatures(kTestCases[i], nullptr).c_str()); + EXPECT_STREQ(kTestCases[i], lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str()); + EXPECT_STREQ(kTestCases[i], lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str()); } } TEST_F(LigatureTableTest, TestRemovesCustomLigatures) { - const char* kTestCases[] = { + const char *kTestCases[] = { "fiction", "fi\uE003ion", "fiction", }; for (size_t i = 0; i < countof(kTestCases); i += 3) { - EXPECT_STREQ(kTestCases[i + 1], - lig_table_->AddLigatures(kTestCases[i], nullptr).c_str()); - EXPECT_STREQ(kTestCases[i + 2], - lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str()); + EXPECT_STREQ(kTestCases[i + 1], lig_table_->AddLigatures(kTestCases[i], nullptr).c_str()); + EXPECT_STREQ(kTestCases[i + 2], lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str()); } } -} // namespace +} // namespace tesseract diff --git 
a/unittest/linlsq_test.cc b/unittest/linlsq_test.cc index 2ca0ea9e..fb459c80 100644 --- a/unittest/linlsq_test.cc +++ b/unittest/linlsq_test.cc @@ -16,35 +16,35 @@ namespace tesseract { class LLSQTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } - public: +public: void TearDown() {} - void ExpectCorrectLine(const LLSQ& llsq, double m, double c, double rms, - double pearson, double tolerance) { + void ExpectCorrectLine(const LLSQ &llsq, double m, double c, double rms, double pearson, + double tolerance) { EXPECT_NEAR(m, llsq.m(), tolerance); EXPECT_NEAR(c, llsq.c(llsq.m()), tolerance); EXPECT_NEAR(rms, llsq.rms(llsq.m(), llsq.c(llsq.m())), tolerance); EXPECT_NEAR(pearson, llsq.pearson(), tolerance); } - FCOORD PtsMean(const std::vector& pts) { + FCOORD PtsMean(const std::vector &pts) { FCOORD total(0, 0); - for (const auto& p : pts) { + for (const auto &p : pts) { total += p; } return (pts.size() > 0) ? total / pts.size() : total; } - void VerifyRmsOrth(const std::vector& pts, const FCOORD& orth) { + void VerifyRmsOrth(const std::vector &pts, const FCOORD &orth) { LLSQ llsq; FCOORD xavg = PtsMean(pts); FCOORD nvec = !orth; nvec.normalise(); double expected_answer = 0; - for (const auto& p : pts) { + for (const auto &p : pts) { llsq.add(p.x(), p.y()); double dot = nvec % (p - xavg); expected_answer += dot * dot; @@ -53,8 +53,8 @@ class LLSQTest : public testing::Test { expected_answer = sqrt(expected_answer); EXPECT_NEAR(expected_answer, llsq.rms_orth(orth), 0.0001); } - void ExpectCorrectVector(const LLSQ& llsq, FCOORD correct_mean_pt, - FCOORD correct_vector, float tolerance) { + void ExpectCorrectVector(const LLSQ &llsq, FCOORD correct_mean_pt, FCOORD correct_vector, + float tolerance) { FCOORD mean_pt = llsq.mean_point(); FCOORD vector = llsq.vector_fit(); EXPECT_NEAR(correct_mean_pt.x(), mean_pt.x(), tolerance); @@ -71,8 +71,7 @@ TEST_F(LLSQTest, BasicLines) { llsq.add(2.0, 2.0); ExpectCorrectLine(llsq, 
1.0, 0.0, 0.0, 1.0, 1e-6); float half_root_2 = sqrt(2.0) / 2.0f; - ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f), - FCOORD(half_root_2, half_root_2), 1e-6); + ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f), FCOORD(half_root_2, half_root_2), 1e-6); llsq.remove(2.0, 2.0); llsq.add(1.0, 2.0); llsq.add(10.0, 1.0); @@ -115,4 +114,4 @@ TEST_F(LLSQTest, RmsOrthWorksAsIntended) { VerifyRmsOrth(pts, FCOORD(2, 1)); } -} // namespace. +} // namespace tesseract diff --git a/unittest/list_test.cc b/unittest/list_test.cc index e6a2bf1d..bb86bcd0 100644 --- a/unittest/list_test.cc +++ b/unittest/list_test.cc @@ -11,17 +11,17 @@ #include "include_gunit.h" #if 0 // TODO: add tests for CLIST -#include "clst.h" +# include "clst.h" #endif #include "elst.h" #if 0 // TODO: add tests for ELIST2 -#include "elst2.h" +# include "elst2.h" #endif namespace tesseract { class ListTest : public ::testing::Test { - protected: +protected: void SetUp() override { static std::locale system_locale(""); std::locale::global(system_locale); @@ -29,9 +29,8 @@ class ListTest : public ::testing::Test { }; class Elst : public ELIST_LINK { - public: - Elst(unsigned n) : value(n) { - } +public: + Elst(unsigned n) : value(n) {} unsigned value; }; @@ -42,22 +41,22 @@ TEST_F(ListTest, TestELIST) { Elst_LIST list; auto it = ELIST_ITERATOR(&list); for (unsigned i = 0; i < 10; i++) { - auto* elst = new Elst(i); - //EXPECT_TRUE(elst->empty()); - //EXPECT_EQ(elst->length(), 0); + auto *elst = new Elst(i); + // EXPECT_TRUE(elst->empty()); + // EXPECT_EQ(elst->length(), 0); it.add_to_end(elst); } it.move_to_first(); unsigned n = 0; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - auto* elst = reinterpret_cast(it.data()); + auto *elst = reinterpret_cast(it.data()); EXPECT_EQ(elst->value, n); n++; } it.forward(); n++; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { - auto* elst = reinterpret_cast(it.extract()); + auto *elst = reinterpret_cast(it.extract()); EXPECT_EQ(elst->value, n % 10); n++; 
delete elst; @@ -65,4 +64,4 @@ TEST_F(ListTest, TestELIST) { // TODO: add more tests for ELIST } -} // namespace tesseract. +} // namespace tesseract. diff --git a/unittest/loadlang_test.cc b/unittest/loadlang_test.cc index ba7a9f6d..a4cf24f9 100644 --- a/unittest/loadlang_test.cc +++ b/unittest/loadlang_test.cc @@ -16,236 +16,556 @@ // limitations under the License. /////////////////////////////////////////////////////////////////////// -#include // std::unique_ptr -#include #include +#include +#include // std::unique_ptr #include "include_gunit.h" namespace tesseract { class QuickTest : public testing::Test { - protected: - virtual void SetUp() { start_time_ = time(nullptr); } +protected: + virtual void SetUp() { + start_time_ = time(nullptr); + } virtual void TearDown() { const time_t end_time = time(nullptr); EXPECT_TRUE(end_time - start_time_ <= 25) - << "The test took too long - " - << ::testing::PrintToString(end_time - start_time_); + << "The test took too long - " << ::testing::PrintToString(end_time - start_time_); } time_t start_time_; }; -void LangLoader(const char* lang, const char* tessdatadir) { +void LangLoader(const char *lang, const char *tessdatadir) { std::unique_ptr api(new tesseract::TessBaseAPI()); - ASSERT_FALSE(api->Init(tessdatadir, lang)) - << "Could not initialize tesseract for $lang."; + ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract for $lang."; api->End(); } // For all languages -class LoadLanguage : public QuickTest, - public ::testing::WithParamInterface {}; +class LoadLanguage : public QuickTest, public ::testing::WithParamInterface {}; -TEST_P(LoadLanguage, afr) { LangLoader("afr", GetParam()); } -TEST_P(LoadLanguage, amh) { LangLoader("amh", GetParam()); } -TEST_P(LoadLanguage, ara) { LangLoader("ara", GetParam()); } -TEST_P(LoadLanguage, asm) { LangLoader("asm", GetParam()); } -TEST_P(LoadLanguage, aze) { LangLoader("aze", GetParam()); } -TEST_P(LoadLanguage, aze_cyrl) { LangLoader("aze_cyrl", 
GetParam()); } -TEST_P(LoadLanguage, bel) { LangLoader("bel", GetParam()); } -TEST_P(LoadLanguage, ben) { LangLoader("ben", GetParam()); } -TEST_P(LoadLanguage, bod) { LangLoader("bod", GetParam()); } -TEST_P(LoadLanguage, bos) { LangLoader("bos", GetParam()); } -TEST_P(LoadLanguage, bre) { LangLoader("bre", GetParam()); } -TEST_P(LoadLanguage, bul) { LangLoader("bul", GetParam()); } -TEST_P(LoadLanguage, cat) { LangLoader("cat", GetParam()); } -TEST_P(LoadLanguage, ceb) { LangLoader("ceb", GetParam()); } -TEST_P(LoadLanguage, ces) { LangLoader("ces", GetParam()); } -TEST_P(LoadLanguage, chi_sim) { LangLoader("chi_sim", GetParam()); } -TEST_P(LoadLanguage, chi_sim_vert) { LangLoader("chi_sim_vert", GetParam()); } -TEST_P(LoadLanguage, chi_tra) { LangLoader("chi_tra", GetParam()); } -TEST_P(LoadLanguage, chi_tra_vert) { LangLoader("chi_tra_vert", GetParam()); } -TEST_P(LoadLanguage, chr) { LangLoader("chr", GetParam()); } -TEST_P(LoadLanguage, cos) { LangLoader("cos", GetParam()); } -TEST_P(LoadLanguage, cym) { LangLoader("cym", GetParam()); } -TEST_P(LoadLanguage, dan) { LangLoader("dan", GetParam()); } -TEST_P(LoadLanguage, deu) { LangLoader("deu", GetParam()); } -TEST_P(LoadLanguage, div) { LangLoader("div", GetParam()); } -TEST_P(LoadLanguage, dzo) { LangLoader("dzo", GetParam()); } -TEST_P(LoadLanguage, ell) { LangLoader("ell", GetParam()); } -TEST_P(LoadLanguage, eng) { LangLoader("eng", GetParam()); } -TEST_P(LoadLanguage, enm) { LangLoader("enm", GetParam()); } -TEST_P(LoadLanguage, epo) { LangLoader("epo", GetParam()); } -TEST_P(LoadLanguage, est) { LangLoader("est", GetParam()); } -TEST_P(LoadLanguage, eus) { LangLoader("eus", GetParam()); } -TEST_P(LoadLanguage, fao) { LangLoader("fao", GetParam()); } -TEST_P(LoadLanguage, fas) { LangLoader("fas", GetParam()); } -TEST_P(LoadLanguage, fil) { LangLoader("fil", GetParam()); } -TEST_P(LoadLanguage, fin) { LangLoader("fin", GetParam()); } -TEST_P(LoadLanguage, fra) { LangLoader("fra", GetParam()); } 
-TEST_P(LoadLanguage, frk) { LangLoader("frk", GetParam()); } -TEST_P(LoadLanguage, frm) { LangLoader("frm", GetParam()); } -TEST_P(LoadLanguage, fry) { LangLoader("fry", GetParam()); } -TEST_P(LoadLanguage, gla) { LangLoader("gla", GetParam()); } -TEST_P(LoadLanguage, gle) { LangLoader("gle", GetParam()); } -TEST_P(LoadLanguage, glg) { LangLoader("glg", GetParam()); } -TEST_P(LoadLanguage, grc) { LangLoader("grc", GetParam()); } -TEST_P(LoadLanguage, guj) { LangLoader("guj", GetParam()); } -TEST_P(LoadLanguage, hat) { LangLoader("hat", GetParam()); } -TEST_P(LoadLanguage, heb) { LangLoader("heb", GetParam()); } -TEST_P(LoadLanguage, hin) { LangLoader("hin", GetParam()); } -TEST_P(LoadLanguage, hrv) { LangLoader("hrv", GetParam()); } -TEST_P(LoadLanguage, hun) { LangLoader("hun", GetParam()); } -TEST_P(LoadLanguage, hye) { LangLoader("hye", GetParam()); } -TEST_P(LoadLanguage, iku) { LangLoader("iku", GetParam()); } -TEST_P(LoadLanguage, ind) { LangLoader("ind", GetParam()); } -TEST_P(LoadLanguage, isl) { LangLoader("isl", GetParam()); } -TEST_P(LoadLanguage, ita) { LangLoader("ita", GetParam()); } -TEST_P(LoadLanguage, ita_old) { LangLoader("ita_old", GetParam()); } -TEST_P(LoadLanguage, jav) { LangLoader("jav", GetParam()); } -TEST_P(LoadLanguage, jpn) { LangLoader("jpn", GetParam()); } -TEST_P(LoadLanguage, jpn_vert) { LangLoader("jpn_vert", GetParam()); } -TEST_P(LoadLanguage, kan) { LangLoader("kan", GetParam()); } -TEST_P(LoadLanguage, kat) { LangLoader("kat", GetParam()); } -TEST_P(LoadLanguage, kat_old) { LangLoader("kat_old", GetParam()); } -TEST_P(LoadLanguage, kaz) { LangLoader("kaz", GetParam()); } -TEST_P(LoadLanguage, khm) { LangLoader("khm", GetParam()); } -TEST_P(LoadLanguage, kir) { LangLoader("kir", GetParam()); } +TEST_P(LoadLanguage, afr) { + LangLoader("afr", GetParam()); +} +TEST_P(LoadLanguage, amh) { + LangLoader("amh", GetParam()); +} +TEST_P(LoadLanguage, ara) { + LangLoader("ara", GetParam()); +} +TEST_P(LoadLanguage, asm) { + 
LangLoader("asm", GetParam()); +} +TEST_P(LoadLanguage, aze) { + LangLoader("aze", GetParam()); +} +TEST_P(LoadLanguage, aze_cyrl) { + LangLoader("aze_cyrl", GetParam()); +} +TEST_P(LoadLanguage, bel) { + LangLoader("bel", GetParam()); +} +TEST_P(LoadLanguage, ben) { + LangLoader("ben", GetParam()); +} +TEST_P(LoadLanguage, bod) { + LangLoader("bod", GetParam()); +} +TEST_P(LoadLanguage, bos) { + LangLoader("bos", GetParam()); +} +TEST_P(LoadLanguage, bre) { + LangLoader("bre", GetParam()); +} +TEST_P(LoadLanguage, bul) { + LangLoader("bul", GetParam()); +} +TEST_P(LoadLanguage, cat) { + LangLoader("cat", GetParam()); +} +TEST_P(LoadLanguage, ceb) { + LangLoader("ceb", GetParam()); +} +TEST_P(LoadLanguage, ces) { + LangLoader("ces", GetParam()); +} +TEST_P(LoadLanguage, chi_sim) { + LangLoader("chi_sim", GetParam()); +} +TEST_P(LoadLanguage, chi_sim_vert) { + LangLoader("chi_sim_vert", GetParam()); +} +TEST_P(LoadLanguage, chi_tra) { + LangLoader("chi_tra", GetParam()); +} +TEST_P(LoadLanguage, chi_tra_vert) { + LangLoader("chi_tra_vert", GetParam()); +} +TEST_P(LoadLanguage, chr) { + LangLoader("chr", GetParam()); +} +TEST_P(LoadLanguage, cos) { + LangLoader("cos", GetParam()); +} +TEST_P(LoadLanguage, cym) { + LangLoader("cym", GetParam()); +} +TEST_P(LoadLanguage, dan) { + LangLoader("dan", GetParam()); +} +TEST_P(LoadLanguage, deu) { + LangLoader("deu", GetParam()); +} +TEST_P(LoadLanguage, div) { + LangLoader("div", GetParam()); +} +TEST_P(LoadLanguage, dzo) { + LangLoader("dzo", GetParam()); +} +TEST_P(LoadLanguage, ell) { + LangLoader("ell", GetParam()); +} +TEST_P(LoadLanguage, eng) { + LangLoader("eng", GetParam()); +} +TEST_P(LoadLanguage, enm) { + LangLoader("enm", GetParam()); +} +TEST_P(LoadLanguage, epo) { + LangLoader("epo", GetParam()); +} +TEST_P(LoadLanguage, est) { + LangLoader("est", GetParam()); +} +TEST_P(LoadLanguage, eus) { + LangLoader("eus", GetParam()); +} +TEST_P(LoadLanguage, fao) { + LangLoader("fao", GetParam()); +} 
+TEST_P(LoadLanguage, fas) { + LangLoader("fas", GetParam()); +} +TEST_P(LoadLanguage, fil) { + LangLoader("fil", GetParam()); +} +TEST_P(LoadLanguage, fin) { + LangLoader("fin", GetParam()); +} +TEST_P(LoadLanguage, fra) { + LangLoader("fra", GetParam()); +} +TEST_P(LoadLanguage, frk) { + LangLoader("frk", GetParam()); +} +TEST_P(LoadLanguage, frm) { + LangLoader("frm", GetParam()); +} +TEST_P(LoadLanguage, fry) { + LangLoader("fry", GetParam()); +} +TEST_P(LoadLanguage, gla) { + LangLoader("gla", GetParam()); +} +TEST_P(LoadLanguage, gle) { + LangLoader("gle", GetParam()); +} +TEST_P(LoadLanguage, glg) { + LangLoader("glg", GetParam()); +} +TEST_P(LoadLanguage, grc) { + LangLoader("grc", GetParam()); +} +TEST_P(LoadLanguage, guj) { + LangLoader("guj", GetParam()); +} +TEST_P(LoadLanguage, hat) { + LangLoader("hat", GetParam()); +} +TEST_P(LoadLanguage, heb) { + LangLoader("heb", GetParam()); +} +TEST_P(LoadLanguage, hin) { + LangLoader("hin", GetParam()); +} +TEST_P(LoadLanguage, hrv) { + LangLoader("hrv", GetParam()); +} +TEST_P(LoadLanguage, hun) { + LangLoader("hun", GetParam()); +} +TEST_P(LoadLanguage, hye) { + LangLoader("hye", GetParam()); +} +TEST_P(LoadLanguage, iku) { + LangLoader("iku", GetParam()); +} +TEST_P(LoadLanguage, ind) { + LangLoader("ind", GetParam()); +} +TEST_P(LoadLanguage, isl) { + LangLoader("isl", GetParam()); +} +TEST_P(LoadLanguage, ita) { + LangLoader("ita", GetParam()); +} +TEST_P(LoadLanguage, ita_old) { + LangLoader("ita_old", GetParam()); +} +TEST_P(LoadLanguage, jav) { + LangLoader("jav", GetParam()); +} +TEST_P(LoadLanguage, jpn) { + LangLoader("jpn", GetParam()); +} +TEST_P(LoadLanguage, jpn_vert) { + LangLoader("jpn_vert", GetParam()); +} +TEST_P(LoadLanguage, kan) { + LangLoader("kan", GetParam()); +} +TEST_P(LoadLanguage, kat) { + LangLoader("kat", GetParam()); +} +TEST_P(LoadLanguage, kat_old) { + LangLoader("kat_old", GetParam()); +} +TEST_P(LoadLanguage, kaz) { + LangLoader("kaz", GetParam()); +} +TEST_P(LoadLanguage, 
khm) { + LangLoader("khm", GetParam()); +} +TEST_P(LoadLanguage, kir) { + LangLoader("kir", GetParam()); +} // TEST_P(LoadLanguage, kmr) {LangLoader("kmr" , GetParam());} -TEST_P(LoadLanguage, kor) { LangLoader("kor", GetParam()); } -TEST_P(LoadLanguage, kor_vert) { LangLoader("kor_vert", GetParam()); } -TEST_P(LoadLanguage, lao) { LangLoader("lao", GetParam()); } -TEST_P(LoadLanguage, lat) { LangLoader("lat", GetParam()); } -TEST_P(LoadLanguage, lav) { LangLoader("lav", GetParam()); } -TEST_P(LoadLanguage, lit) { LangLoader("lit", GetParam()); } -TEST_P(LoadLanguage, ltz) { LangLoader("ltz", GetParam()); } -TEST_P(LoadLanguage, mal) { LangLoader("mal", GetParam()); } -TEST_P(LoadLanguage, mar) { LangLoader("mar", GetParam()); } -TEST_P(LoadLanguage, mkd) { LangLoader("mkd", GetParam()); } -TEST_P(LoadLanguage, mlt) { LangLoader("mlt", GetParam()); } -TEST_P(LoadLanguage, mon) { LangLoader("mon", GetParam()); } -TEST_P(LoadLanguage, mri) { LangLoader("mri", GetParam()); } -TEST_P(LoadLanguage, msa) { LangLoader("msa", GetParam()); } -TEST_P(LoadLanguage, mya) { LangLoader("mya", GetParam()); } -TEST_P(LoadLanguage, nep) { LangLoader("nep", GetParam()); } -TEST_P(LoadLanguage, nld) { LangLoader("nld", GetParam()); } -TEST_P(LoadLanguage, nor) { LangLoader("nor", GetParam()); } -TEST_P(LoadLanguage, oci) { LangLoader("oci", GetParam()); } -TEST_P(LoadLanguage, ori) { LangLoader("ori", GetParam()); } -TEST_P(LoadLanguage, osd) { LangLoader("osd", GetParam()); } -TEST_P(LoadLanguage, pan) { LangLoader("pan", GetParam()); } -TEST_P(LoadLanguage, pol) { LangLoader("pol", GetParam()); } -TEST_P(LoadLanguage, por) { LangLoader("por", GetParam()); } -TEST_P(LoadLanguage, pus) { LangLoader("pus", GetParam()); } -TEST_P(LoadLanguage, que) { LangLoader("que", GetParam()); } -TEST_P(LoadLanguage, ron) { LangLoader("ron", GetParam()); } -TEST_P(LoadLanguage, rus) { LangLoader("rus", GetParam()); } -TEST_P(LoadLanguage, san) { LangLoader("san", GetParam()); } 
-TEST_P(LoadLanguage, sin) { LangLoader("sin", GetParam()); } -TEST_P(LoadLanguage, slk) { LangLoader("slk", GetParam()); } -TEST_P(LoadLanguage, slv) { LangLoader("slv", GetParam()); } -TEST_P(LoadLanguage, snd) { LangLoader("snd", GetParam()); } -TEST_P(LoadLanguage, spa) { LangLoader("spa", GetParam()); } -TEST_P(LoadLanguage, spa_old) { LangLoader("spa_old", GetParam()); } -TEST_P(LoadLanguage, sqi) { LangLoader("sqi", GetParam()); } -TEST_P(LoadLanguage, srp) { LangLoader("srp", GetParam()); } -TEST_P(LoadLanguage, srp_latn) { LangLoader("srp_latn", GetParam()); } -TEST_P(LoadLanguage, sun) { LangLoader("sun", GetParam()); } -TEST_P(LoadLanguage, swa) { LangLoader("swa", GetParam()); } -TEST_P(LoadLanguage, swe) { LangLoader("swe", GetParam()); } -TEST_P(LoadLanguage, syr) { LangLoader("syr", GetParam()); } -TEST_P(LoadLanguage, tam) { LangLoader("tam", GetParam()); } -TEST_P(LoadLanguage, tat) { LangLoader("tat", GetParam()); } -TEST_P(LoadLanguage, tel) { LangLoader("tel", GetParam()); } -TEST_P(LoadLanguage, tgk) { LangLoader("tgk", GetParam()); } -TEST_P(LoadLanguage, tha) { LangLoader("tha", GetParam()); } -TEST_P(LoadLanguage, tir) { LangLoader("tir", GetParam()); } -TEST_P(LoadLanguage, ton) { LangLoader("ton", GetParam()); } -TEST_P(LoadLanguage, tur) { LangLoader("tur", GetParam()); } -TEST_P(LoadLanguage, uig) { LangLoader("uig", GetParam()); } -TEST_P(LoadLanguage, ukr) { LangLoader("ukr", GetParam()); } -TEST_P(LoadLanguage, urd) { LangLoader("urd", GetParam()); } -TEST_P(LoadLanguage, uzb) { LangLoader("uzb", GetParam()); } -TEST_P(LoadLanguage, uzb_cyrl) { LangLoader("uzb_cyrl", GetParam()); } -TEST_P(LoadLanguage, vie) { LangLoader("vie", GetParam()); } -TEST_P(LoadLanguage, yid) { LangLoader("yid", GetParam()); } -TEST_P(LoadLanguage, yor) { LangLoader("yor", GetParam()); } +TEST_P(LoadLanguage, kor) { + LangLoader("kor", GetParam()); +} +TEST_P(LoadLanguage, kor_vert) { + LangLoader("kor_vert", GetParam()); +} +TEST_P(LoadLanguage, lao) { + 
LangLoader("lao", GetParam()); +} +TEST_P(LoadLanguage, lat) { + LangLoader("lat", GetParam()); +} +TEST_P(LoadLanguage, lav) { + LangLoader("lav", GetParam()); +} +TEST_P(LoadLanguage, lit) { + LangLoader("lit", GetParam()); +} +TEST_P(LoadLanguage, ltz) { + LangLoader("ltz", GetParam()); +} +TEST_P(LoadLanguage, mal) { + LangLoader("mal", GetParam()); +} +TEST_P(LoadLanguage, mar) { + LangLoader("mar", GetParam()); +} +TEST_P(LoadLanguage, mkd) { + LangLoader("mkd", GetParam()); +} +TEST_P(LoadLanguage, mlt) { + LangLoader("mlt", GetParam()); +} +TEST_P(LoadLanguage, mon) { + LangLoader("mon", GetParam()); +} +TEST_P(LoadLanguage, mri) { + LangLoader("mri", GetParam()); +} +TEST_P(LoadLanguage, msa) { + LangLoader("msa", GetParam()); +} +TEST_P(LoadLanguage, mya) { + LangLoader("mya", GetParam()); +} +TEST_P(LoadLanguage, nep) { + LangLoader("nep", GetParam()); +} +TEST_P(LoadLanguage, nld) { + LangLoader("nld", GetParam()); +} +TEST_P(LoadLanguage, nor) { + LangLoader("nor", GetParam()); +} +TEST_P(LoadLanguage, oci) { + LangLoader("oci", GetParam()); +} +TEST_P(LoadLanguage, ori) { + LangLoader("ori", GetParam()); +} +TEST_P(LoadLanguage, osd) { + LangLoader("osd", GetParam()); +} +TEST_P(LoadLanguage, pan) { + LangLoader("pan", GetParam()); +} +TEST_P(LoadLanguage, pol) { + LangLoader("pol", GetParam()); +} +TEST_P(LoadLanguage, por) { + LangLoader("por", GetParam()); +} +TEST_P(LoadLanguage, pus) { + LangLoader("pus", GetParam()); +} +TEST_P(LoadLanguage, que) { + LangLoader("que", GetParam()); +} +TEST_P(LoadLanguage, ron) { + LangLoader("ron", GetParam()); +} +TEST_P(LoadLanguage, rus) { + LangLoader("rus", GetParam()); +} +TEST_P(LoadLanguage, san) { + LangLoader("san", GetParam()); +} +TEST_P(LoadLanguage, sin) { + LangLoader("sin", GetParam()); +} +TEST_P(LoadLanguage, slk) { + LangLoader("slk", GetParam()); +} +TEST_P(LoadLanguage, slv) { + LangLoader("slv", GetParam()); +} +TEST_P(LoadLanguage, snd) { + LangLoader("snd", GetParam()); +} 
+TEST_P(LoadLanguage, spa) { + LangLoader("spa", GetParam()); +} +TEST_P(LoadLanguage, spa_old) { + LangLoader("spa_old", GetParam()); +} +TEST_P(LoadLanguage, sqi) { + LangLoader("sqi", GetParam()); +} +TEST_P(LoadLanguage, srp) { + LangLoader("srp", GetParam()); +} +TEST_P(LoadLanguage, srp_latn) { + LangLoader("srp_latn", GetParam()); +} +TEST_P(LoadLanguage, sun) { + LangLoader("sun", GetParam()); +} +TEST_P(LoadLanguage, swa) { + LangLoader("swa", GetParam()); +} +TEST_P(LoadLanguage, swe) { + LangLoader("swe", GetParam()); +} +TEST_P(LoadLanguage, syr) { + LangLoader("syr", GetParam()); +} +TEST_P(LoadLanguage, tam) { + LangLoader("tam", GetParam()); +} +TEST_P(LoadLanguage, tat) { + LangLoader("tat", GetParam()); +} +TEST_P(LoadLanguage, tel) { + LangLoader("tel", GetParam()); +} +TEST_P(LoadLanguage, tgk) { + LangLoader("tgk", GetParam()); +} +TEST_P(LoadLanguage, tha) { + LangLoader("tha", GetParam()); +} +TEST_P(LoadLanguage, tir) { + LangLoader("tir", GetParam()); +} +TEST_P(LoadLanguage, ton) { + LangLoader("ton", GetParam()); +} +TEST_P(LoadLanguage, tur) { + LangLoader("tur", GetParam()); +} +TEST_P(LoadLanguage, uig) { + LangLoader("uig", GetParam()); +} +TEST_P(LoadLanguage, ukr) { + LangLoader("ukr", GetParam()); +} +TEST_P(LoadLanguage, urd) { + LangLoader("urd", GetParam()); +} +TEST_P(LoadLanguage, uzb) { + LangLoader("uzb", GetParam()); +} +TEST_P(LoadLanguage, uzb_cyrl) { + LangLoader("uzb_cyrl", GetParam()); +} +TEST_P(LoadLanguage, vie) { + LangLoader("vie", GetParam()); +} +TEST_P(LoadLanguage, yid) { + LangLoader("yid", GetParam()); +} +TEST_P(LoadLanguage, yor) { + LangLoader("yor", GetParam()); +} INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadLanguage, - ::testing::Values(TESSDATA_DIR "_fast")); + ::testing::Values(TESSDATA_DIR "_fast")); INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadLanguage, - ::testing::Values(TESSDATA_DIR "_best")); -INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage, - 
::testing::Values(TESSDATA_DIR)); + ::testing::Values(TESSDATA_DIR "_best")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage, ::testing::Values(TESSDATA_DIR)); // For all scripts -class LoadScript : public QuickTest, - public ::testing::WithParamInterface {}; +class LoadScript : public QuickTest, public ::testing::WithParamInterface {}; -TEST_P(LoadScript, Arabic) { LangLoader("script/Arabic", GetParam()); } -TEST_P(LoadScript, Armenian) { LangLoader("script/Armenian", GetParam()); } -TEST_P(LoadScript, Bengali) { LangLoader("script/Bengali", GetParam()); } +TEST_P(LoadScript, Arabic) { + LangLoader("script/Arabic", GetParam()); +} +TEST_P(LoadScript, Armenian) { + LangLoader("script/Armenian", GetParam()); +} +TEST_P(LoadScript, Bengali) { + LangLoader("script/Bengali", GetParam()); +} TEST_P(LoadScript, Canadian_Aboriginal) { LangLoader("script/Canadian_Aboriginal", GetParam()); } -TEST_P(LoadScript, Cherokee) { LangLoader("script/Cherokee", GetParam()); } -TEST_P(LoadScript, Cyrillic) { LangLoader("script/Cyrillic", GetParam()); } -TEST_P(LoadScript, Devanagari) { LangLoader("script/Devanagari", GetParam()); } -TEST_P(LoadScript, Ethiopic) { LangLoader("script/Ethiopic", GetParam()); } -TEST_P(LoadScript, Fraktur) { LangLoader("script/Fraktur", GetParam()); } -TEST_P(LoadScript, Georgian) { LangLoader("script/Georgian", GetParam()); } -TEST_P(LoadScript, Greek) { LangLoader("script/Greek", GetParam()); } -TEST_P(LoadScript, Gujarati) { LangLoader("script/Gujarati", GetParam()); } -TEST_P(LoadScript, Gurmukhi) { LangLoader("script/Gurmukhi", GetParam()); } -TEST_P(LoadScript, HanS) { LangLoader("script/HanS", GetParam()); } -TEST_P(LoadScript, HanS_vert) { LangLoader("script/HanS_vert", GetParam()); } -TEST_P(LoadScript, HanT) { LangLoader("script/HanT", GetParam()); } -TEST_P(LoadScript, HanT_vert) { LangLoader("script/HanT_vert", GetParam()); } -TEST_P(LoadScript, Hangul) { LangLoader("script/Hangul", GetParam()); } +TEST_P(LoadScript, Cherokee) { + 
LangLoader("script/Cherokee", GetParam()); +} +TEST_P(LoadScript, Cyrillic) { + LangLoader("script/Cyrillic", GetParam()); +} +TEST_P(LoadScript, Devanagari) { + LangLoader("script/Devanagari", GetParam()); +} +TEST_P(LoadScript, Ethiopic) { + LangLoader("script/Ethiopic", GetParam()); +} +TEST_P(LoadScript, Fraktur) { + LangLoader("script/Fraktur", GetParam()); +} +TEST_P(LoadScript, Georgian) { + LangLoader("script/Georgian", GetParam()); +} +TEST_P(LoadScript, Greek) { + LangLoader("script/Greek", GetParam()); +} +TEST_P(LoadScript, Gujarati) { + LangLoader("script/Gujarati", GetParam()); +} +TEST_P(LoadScript, Gurmukhi) { + LangLoader("script/Gurmukhi", GetParam()); +} +TEST_P(LoadScript, HanS) { + LangLoader("script/HanS", GetParam()); +} +TEST_P(LoadScript, HanS_vert) { + LangLoader("script/HanS_vert", GetParam()); +} +TEST_P(LoadScript, HanT) { + LangLoader("script/HanT", GetParam()); +} +TEST_P(LoadScript, HanT_vert) { + LangLoader("script/HanT_vert", GetParam()); +} +TEST_P(LoadScript, Hangul) { + LangLoader("script/Hangul", GetParam()); +} TEST_P(LoadScript, Hangul_vert) { LangLoader("script/Hangul_vert", GetParam()); } -TEST_P(LoadScript, Hebrew) { LangLoader("script/Hebrew", GetParam()); } -TEST_P(LoadScript, Japanese) { LangLoader("script/Japanese", GetParam()); } +TEST_P(LoadScript, Hebrew) { + LangLoader("script/Hebrew", GetParam()); +} +TEST_P(LoadScript, Japanese) { + LangLoader("script/Japanese", GetParam()); +} TEST_P(LoadScript, Japanese_vert) { LangLoader("script/Japanese_vert", GetParam()); } -TEST_P(LoadScript, Kannada) { LangLoader("script/Kannada", GetParam()); } -TEST_P(LoadScript, Khmer) { LangLoader("script/Khmer", GetParam()); } -TEST_P(LoadScript, Lao) { LangLoader("script/Lao", GetParam()); } -TEST_P(LoadScript, Latin) { LangLoader("script/Latin", GetParam()); } -TEST_P(LoadScript, Malayalam) { LangLoader("script/Malayalam", GetParam()); } -TEST_P(LoadScript, Myanmar) { LangLoader("script/Myanmar", GetParam()); } -TEST_P(LoadScript, 
Oriya) { LangLoader("script/Oriya", GetParam()); } -TEST_P(LoadScript, Sinhala) { LangLoader("script/Sinhala", GetParam()); } -TEST_P(LoadScript, Syriac) { LangLoader("script/Syriac", GetParam()); } -TEST_P(LoadScript, Tamil) { LangLoader("script/Tamil", GetParam()); } -TEST_P(LoadScript, Telugu) { LangLoader("script/Telugu", GetParam()); } -TEST_P(LoadScript, Thaana) { LangLoader("script/Thaana", GetParam()); } -TEST_P(LoadScript, Thai) { LangLoader("script/Thai", GetParam()); } -TEST_P(LoadScript, Tibetan) { LangLoader("script/Tibetan", GetParam()); } -TEST_P(LoadScript, Vietnamese) { LangLoader("script/Vietnamese", GetParam()); } +TEST_P(LoadScript, Kannada) { + LangLoader("script/Kannada", GetParam()); +} +TEST_P(LoadScript, Khmer) { + LangLoader("script/Khmer", GetParam()); +} +TEST_P(LoadScript, Lao) { + LangLoader("script/Lao", GetParam()); +} +TEST_P(LoadScript, Latin) { + LangLoader("script/Latin", GetParam()); +} +TEST_P(LoadScript, Malayalam) { + LangLoader("script/Malayalam", GetParam()); +} +TEST_P(LoadScript, Myanmar) { + LangLoader("script/Myanmar", GetParam()); +} +TEST_P(LoadScript, Oriya) { + LangLoader("script/Oriya", GetParam()); +} +TEST_P(LoadScript, Sinhala) { + LangLoader("script/Sinhala", GetParam()); +} +TEST_P(LoadScript, Syriac) { + LangLoader("script/Syriac", GetParam()); +} +TEST_P(LoadScript, Tamil) { + LangLoader("script/Tamil", GetParam()); +} +TEST_P(LoadScript, Telugu) { + LangLoader("script/Telugu", GetParam()); +} +TEST_P(LoadScript, Thaana) { + LangLoader("script/Thaana", GetParam()); +} +TEST_P(LoadScript, Thai) { + LangLoader("script/Thai", GetParam()); +} +TEST_P(LoadScript, Tibetan) { + LangLoader("script/Tibetan", GetParam()); +} +TEST_P(LoadScript, Vietnamese) { + LangLoader("script/Vietnamese", GetParam()); +} INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadScript, - ::testing::Values(TESSDATA_DIR "_fast")); + ::testing::Values(TESSDATA_DIR "_fast")); INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadScript, - 
::testing::Values(TESSDATA_DIR "_best")); -INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript, - ::testing::Values(TESSDATA_DIR)); + ::testing::Values(TESSDATA_DIR "_best")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript, ::testing::Values(TESSDATA_DIR)); class LoadLang : public QuickTest {}; // Test Load of English here, as the parameterized tests are disabled by // default. -TEST_F(LoadLang, engFast) { LangLoader("eng", TESSDATA_DIR "_fast"); } -TEST_F(LoadLang, engBest) { LangLoader("eng", TESSDATA_DIR "_best"); } -TEST_F(LoadLang, engBestInt) { LangLoader("eng", TESSDATA_DIR); } +TEST_F(LoadLang, engFast) { + LangLoader("eng", TESSDATA_DIR "_fast"); +} +TEST_F(LoadLang, engBest) { + LangLoader("eng", TESSDATA_DIR "_best"); +} +TEST_F(LoadLang, engBestInt) { + LangLoader("eng", TESSDATA_DIR); +} // Use class LoadLang for languages which are NOT there in all three repos -TEST_F(LoadLang, kmrFast) { LangLoader("kmr", TESSDATA_DIR "_fast"); } -TEST_F(LoadLang, kmrBest) { LangLoader("kmr", TESSDATA_DIR "_best"); } +TEST_F(LoadLang, kmrFast) { + LangLoader("kmr", TESSDATA_DIR "_fast"); +} +TEST_F(LoadLang, kmrBest) { + LangLoader("kmr", TESSDATA_DIR "_best"); +} // TEST_F(LoadLang, kmrBestInt) {LangLoader("kmr" , TESSDATA_DIR);} -} // namespace +} // namespace tesseract diff --git a/unittest/log.h b/unittest/log.h index 0b21f3ee..9ee7ec65 100644 --- a/unittest/log.h +++ b/unittest/log.h @@ -25,15 +25,12 @@ #include -enum LogLevel { - INFO, WARNING, ERROR, FATAL -}; +enum LogLevel { INFO, WARNING, ERROR, FATAL }; // Avoid conflict with logging.h from TensorFlow. 
#undef LOG -static inline std::ostream& LOG(enum LogLevel level) -{ +static inline std::ostream &LOG(enum LogLevel level) { switch (level) { case INFO: std::cout << "[INFO] "; @@ -55,8 +52,7 @@ static inline std::ostream& LOG(enum LogLevel level) #undef QCHECK // https://github.com/google/ion/blob/master/ion/base/logging.h -static inline std::ostream& QCHECK(bool condition) -{ +static inline std::ostream &QCHECK(bool condition) { if (condition) { static std::ostream null_stream(nullptr); return null_stream; @@ -64,4 +60,4 @@ static inline std::ostream& QCHECK(bool condition) return std::cout; } -#endif // TESSERACT_UNITTEST_LOG_H_ +#endif // TESSERACT_UNITTEST_LOG_H_ diff --git a/unittest/lstm_recode_test.cc b/unittest/lstm_recode_test.cc index 5365bf4b..b8121a4a 100644 --- a/unittest/lstm_recode_test.cc +++ b/unittest/lstm_recode_test.cc @@ -22,8 +22,8 @@ TEST_F(LSTMTrainerTest, RecodeTestKorBase) { "kor.Arial_Unicode_MS.exp0.lstmf", false, true, 5e-4, false, "kor"); double kor_full_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(kor_full_err, 88); -// EXPECT_GT(kor_full_err, 85); - LOG(INFO) << "********** Expected < 88 ************\n" ; + // EXPECT_GT(kor_full_err, 85); + LOG(INFO) << "********** Expected < 88 ************\n"; } TEST_F(LSTMTrainerTest, RecodeTestKor) { @@ -32,7 +32,7 @@ TEST_F(LSTMTrainerTest, RecodeTestKor) { "kor.Arial_Unicode_MS.exp0.lstmf", true, true, 5e-4, false, "kor"); double kor_recode_err = TrainIterations(kTrainerIterations); EXPECT_LT(kor_recode_err, 60); - LOG(INFO) << "********** Expected < 60 ************\n" ; + LOG(INFO) << "********** Expected < 60 ************\n"; } // Tests that the given string encodes and decodes back to the same @@ -42,4 +42,4 @@ TEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) { TestEncodeDecodeBoth("kor", "한국어 위키백과에 오신 것을 환영합니다!"); } -} // namespace tesseract. +} // namespace tesseract. 
diff --git a/unittest/lstm_squashed_test.cc b/unittest/lstm_squashed_test.cc index 1dd08746..2a07ca06 100644 --- a/unittest/lstm_squashed_test.cc +++ b/unittest/lstm_squashed_test.cc @@ -20,12 +20,12 @@ TEST_F(LSTMTrainerTest, TestSquashed) { // a small convolution/maxpool below that. // Match training conditions to those typically used with this spec: // recoding on, adam on. - SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", - "SQU-2-layer-lstm", /*recode*/ true, /*adam*/ true); + SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", "SQU-2-layer-lstm", + /*recode*/ true, /*adam*/ true); double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); EXPECT_LT(lstm_2d_err, 80); - LOG(INFO) << "********** < 80 ************\n" ; + LOG(INFO) << "********** < 80 ************\n"; TestIntMode(kTrainerIterations); } -} // namespace tesseract. +} // namespace tesseract. diff --git a/unittest/lstm_test.cc b/unittest/lstm_test.cc index 930384a6..4f053b1b 100644 --- a/unittest/lstm_test.cc +++ b/unittest/lstm_test.cc @@ -31,17 +31,16 @@ TEST_F(LSTMTrainerTest, BasicTest) { SetupTrainer( "[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 " "Ct1,1,64O1c1]", - "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false, - 2e-4, false, "eng"); + "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false, 2e-4, false, "eng"); double non_lstm_err = TrainIterations(kTrainerIterations * 4); EXPECT_LT(non_lstm_err, 98); - LOG(INFO) << "********** Expected < 98 ************\n" ; + LOG(INFO) << "********** Expected < 98 ************\n"; // A basic single-layer, single direction LSTM. SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false); double lstm_uni_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_uni_err, 86); - LOG(INFO) << "********** Expected < 86 ************\n" ; + LOG(INFO) << "********** Expected < 86 ************\n"; // Beats the convolver. 
(Although it does have a lot more weights, it still // iterates faster.) EXPECT_LT(lstm_uni_err, non_lstm_err); @@ -50,12 +49,11 @@ TEST_F(LSTMTrainerTest, BasicTest) { // Color learns almost as fast as normalized grey/2D. TEST_F(LSTMTrainerTest, ColorTest) { // A basic single-layer, single direction LSTM. - SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", - "2D-color-lstm", true, true); + SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2D-color-lstm", true, true); double lstm_uni_err = TrainIterations(kTrainerIterations); EXPECT_LT(lstm_uni_err, 85); -// EXPECT_GT(lstm_uni_err, 66); - LOG(INFO) << "********** Expected < 85 ************\n" ; + // EXPECT_GT(lstm_uni_err, 66); + LOG(INFO) << "********** Expected < 85 ************\n"; } TEST_F(LSTMTrainerTest, BidiTest) { @@ -63,7 +61,7 @@ TEST_F(LSTMTrainerTest, BidiTest) { SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false); double lstm_bi_err = TrainIterations(kTrainerIterations); EXPECT_LT(lstm_bi_err, 75); - LOG(INFO) << "********** Expected < 75 ************\n" ; + LOG(INFO) << "********** Expected < 75 ************\n"; // Int mode training is dead, so convert the trained network to int and check // that its error rate is close to the float version. TestIntMode(kTrainerIterations); @@ -73,12 +71,12 @@ TEST_F(LSTMTrainerTest, BidiTest) { // It takes a lot of iterations to get there. TEST_F(LSTMTrainerTest, Test2D) { // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. 
- SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", - "2-D-2-layer-lstm", false, false); - double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2 ); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, + false); + double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); EXPECT_LT(lstm_2d_err, 98); -// EXPECT_GT(lstm_2d_err, 90); - LOG(INFO) << "********** Expected < 98 ************\n" ; + // EXPECT_GT(lstm_2d_err, 90); + LOG(INFO) << "********** Expected < 98 ************\n"; // Int mode training is dead, so convert the trained network to int and check // that its error rate is close to the float version. TestIntMode(kTrainerIterations); @@ -88,11 +86,11 @@ TEST_F(LSTMTrainerTest, Test2D) { // without it. TEST_F(LSTMTrainerTest, TestAdam) { // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. - SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", - "2-D-2-layer-lstm", false, true); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, + true); double lstm_2d_err = TrainIterations(kTrainerIterations); EXPECT_LT(lstm_2d_err, 70); - LOG(INFO) << "********** Expected < 70 ************\n" ; + LOG(INFO) << "********** Expected < 70 ************\n"; TestIntMode(kTrainerIterations); } @@ -103,22 +101,21 @@ TEST_F(LSTMTrainerTest, SpeedTest) { "O1c1]", "2-D-2-layer-lstm", false, true); TrainIterations(kTrainerIterations); - LOG(INFO) << "********** *** ************\n" ; + LOG(INFO) << "********** *** ************\n"; } // Tests that two identical networks trained the same get the same results. // Also tests that the same happens with a serialize/deserialize in the middle. 
TEST_F(LSTMTrainerTest, DeterminismTest) { - SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", - "2-D-2-layer-lstm", false, false); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, + false); double lstm_2d_err_a = TrainIterations(kTrainerIterations); double act_error_a = trainer_->ActivationError(); double char_error_a = trainer_->CharError(); std::vector trainer_a_data; - EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), - &trainer_a_data)); - SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", - "2-D-2-layer-lstm", false, false); + EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), &trainer_a_data)); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, + false); double lstm_2d_err_b = TrainIterations(kTrainerIterations); double act_error_b = trainer_->ActivationError(); double char_error_b = trainer_->CharError(); @@ -130,8 +127,8 @@ TEST_F(LSTMTrainerTest, DeterminismTest) { act_error_b = trainer_->ActivationError(); char_error_b = trainer_->CharError(); // Unpack into a new trainer and train that some more too. - SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", - "2-D-2-layer-lstm", false, false); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false, + false); EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, trainer_.get())); lstm_2d_err_a = TrainIterations(kTrainerIterations / 3); act_error_a = trainer_->ActivationError(); @@ -139,7 +136,7 @@ TEST_F(LSTMTrainerTest, DeterminismTest) { EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); EXPECT_FLOAT_EQ(act_error_a, act_error_b); EXPECT_FLOAT_EQ(char_error_a, char_error_b); - LOG(INFO) << "********** *** ************\n" ; + LOG(INFO) << "********** *** ************\n"; } // The baseline network against which to test the built-in softmax. 
@@ -148,8 +145,8 @@ TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) { SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true); double lstm_uni_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_uni_err, 60); -// EXPECT_GT(lstm_uni_err, 48); - LOG(INFO) << "********** Expected < 60 ************\n" ; + // EXPECT_GT(lstm_uni_err, 48); + LOG(INFO) << "********** Expected < 60 ************\n"; // Check that it works in int mode too. TestIntMode(kTrainerIterations); // If we run TestIntMode again, it tests that int_mode networks can @@ -168,7 +165,7 @@ TEST_F(LSTMTrainerTest, SoftmaxTest) { SetupTrainerEng("[1,1,0,32 LS96]", "Lstm-+-softmax", false, true); double lstm_sm_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_sm_err, 49.0); - LOG(INFO) << "********** Expected < 49 ************\n" ; + LOG(INFO) << "********** Expected < 49 ************\n"; // Check that it works in int mode too. TestIntMode(kTrainerIterations); } @@ -180,7 +177,7 @@ TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) { SetupTrainerEng("[1,1,0,32 LE96]", "Lstm-+-softmax", false, true); double lstm_sm_err = TrainIterations(kTrainerIterations * 2); EXPECT_LT(lstm_sm_err, 62.0); - LOG(INFO) << "********** Expected < 62 ************\n" ; + LOG(INFO) << "********** Expected < 62 ************\n"; // Check that it works in int mode too. TestIntMode(kTrainerIterations); } @@ -188,16 +185,13 @@ TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) { // Tests that layer access methods work correctly. TEST_F(LSTMTrainerTest, TestLayerAccess) { // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom. - SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", - false, false); + SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", false, false); // Number of layers. const int kNumLayers = 8; // Expected layer names. 
- const char* kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", - ":3:0", ":4:0", ":4:1:0", ":5"}; - const char* kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL", - "Maxpool", "Lfys32", "Lbx128LTR", - "Lbx128", "Output"}; + const char *kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", ":3:0", ":4:0", ":4:1:0", ":5"}; + const char *kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL", "Maxpool", + "Lfys32", "Lbx128LTR", "Lbx128", "Output"}; // Expected number of weights. const int kNumWeights[kNumLayers] = {0, 0, @@ -212,10 +206,9 @@ TEST_F(LSTMTrainerTest, TestLayerAccess) { EXPECT_EQ(kNumLayers, layers.size()); for (int i = 0; i < kNumLayers && i < layers.size(); ++i) { EXPECT_STREQ(kLayerIds[i], layers[i].c_str()); - EXPECT_STREQ(kLayerNames[i], - trainer_->GetLayer(layers[i])->name().c_str()); + EXPECT_STREQ(kLayerNames[i], trainer_->GetLayer(layers[i])->name().c_str()); EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights()); } } -} // namespace tesseract. +} // namespace tesseract. diff --git a/unittest/lstm_test.h b/unittest/lstm_test.h index 23793ce6..cb4e0062 100644 --- a/unittest/lstm_test.h +++ b/unittest/lstm_test.h @@ -19,12 +19,12 @@ #include "include_gunit.h" #include "absl/strings/str_cat.h" -#include "tprintf.h" #include "helpers.h" +#include "tprintf.h" #include "functions.h" #include "lang_model_helpers.h" -#include "log.h" // for LOG +#include "log.h" // for LOG #include "lstmtrainer.h" #include "unicharset.h" @@ -44,62 +44,57 @@ const int kBatchIterations = 1; // The fixture for testing LSTMTrainer. 
class LSTMTrainerTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); } LSTMTrainerTest() {} - std::string TestDataNameToPath(const std::string& name) { - return file::JoinPath(TESTDATA_DIR, - "" + name); + std::string TestDataNameToPath(const std::string &name) { + return file::JoinPath(TESTDATA_DIR, "" + name); } - std::string TessDataNameToPath(const std::string& name) { - return file::JoinPath(TESSDATA_DIR, - "" + name); + std::string TessDataNameToPath(const std::string &name) { + return file::JoinPath(TESSDATA_DIR, "" + name); } - std::string TestingNameToPath(const std::string& name) { - return file::JoinPath(TESTING_DIR, - "" + name); + std::string TestingNameToPath(const std::string &name) { + return file::JoinPath(TESTING_DIR, "" + name); } - void SetupTrainerEng(const std::string& network_spec, const std::string& model_name, - bool recode, bool adam) { - SetupTrainer(network_spec, model_name, "eng/eng.unicharset", - "eng.Arial.exp0.lstmf", recode, adam, 5e-4, false, "eng"); + void SetupTrainerEng(const std::string &network_spec, const std::string &model_name, bool recode, + bool adam) { + SetupTrainer(network_spec, model_name, "eng/eng.unicharset", "eng.Arial.exp0.lstmf", recode, + adam, 5e-4, false, "eng"); } - void SetupTrainer(const std::string& network_spec, const std::string& model_name, - const std::string& unicharset_file, const std::string& lstmf_file, - bool recode, bool adam, float learning_rate, - bool layer_specific, const std::string& kLang) { -// constexpr char kLang[] = "eng"; // Exact value doesn't matter. + void SetupTrainer(const std::string &network_spec, const std::string &model_name, + const std::string &unicharset_file, const std::string &lstmf_file, bool recode, + bool adam, float learning_rate, bool layer_specific, const std::string &kLang) { + // constexpr char kLang[] = "eng"; // Exact value doesn't matter. 
std::string unicharset_name = TestDataNameToPath(unicharset_file); UNICHARSET unicharset; ASSERT_TRUE(unicharset.load_from_file(unicharset_name.c_str(), false)); - std::string script_dir = file::JoinPath( - LANGDATA_DIR, ""); + std::string script_dir = file::JoinPath(LANGDATA_DIR, ""); std::vector words; - EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir, - kLang, !recode, words, words, words, false, - nullptr, nullptr)); + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir, kLang, !recode, + words, words, words, false, nullptr, nullptr)); std::string model_path = file::JoinPath(FLAGS_test_tmpdir, model_name); std::string checkpoint_path = model_path + "_checkpoint"; - trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(), - 0, 0)); - trainer_->InitCharSet(file::JoinPath(FLAGS_test_tmpdir, kLang, - absl::StrCat(kLang, ".traineddata"))); + trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(), 0, 0)); + trainer_->InitCharSet( + file::JoinPath(FLAGS_test_tmpdir, kLang, absl::StrCat(kLang, ".traineddata"))); int net_mode = adam ? NF_ADAM : 0; // Adam needs a higher learning rate, due to not multiplying the effective // rate by 1/(1-momentum). 
- if (adam) learning_rate *= 20.0f; - if (layer_specific) net_mode |= NF_LAYER_SPECIFIC_LR; - EXPECT_TRUE(trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1, - learning_rate, 0.9, 0.999)); + if (adam) + learning_rate *= 20.0f; + if (layer_specific) + net_mode |= NF_LAYER_SPECIFIC_LR; + EXPECT_TRUE( + trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1, learning_rate, 0.9, 0.999)); std::vector filenames; filenames.push_back(STRING(TestDataNameToPath(lstmf_file).c_str())); EXPECT_TRUE(trainer_->LoadAllTrainingData(filenames, CS_SEQUENTIAL, false)); - LOG(INFO) << "Setup network:" << model_name << "\n" ; + LOG(INFO) << "Setup network:" << model_name << "\n"; } // Trains for a given number of iterations and returns the char error rate. double TrainIterations(int max_iterations) { @@ -119,7 +114,8 @@ class LSTMTrainerTest : public testing::Test { trainer_->MaintainCheckpoints(nullptr, &log_str); iteration = trainer_->training_iteration(); mean_error *= 100.0 / kBatchIterations; - if (mean_error < best_error) best_error = mean_error; + if (mean_error < best_error) + best_error = mean_error; } while (iteration < iteration_limit); LOG(INFO) << "Trainer error rate = " << best_error << "\n"; return best_error; @@ -131,18 +127,17 @@ class LSTMTrainerTest : public testing::Test { double mean_error = 0.0; int error_count = 0; while (error_count < max_iterations) { - const ImageData& trainingdata = + const ImageData &trainingdata = *trainer_->mutable_training_data()->GetPageBySerial(iteration); NetworkIO fwd_outputs, targets; - if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) != - UNENCODABLE) { + if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) != UNENCODABLE) { mean_error += trainer_->NewSingleError(ET_CHAR_ERROR); ++error_count; } trainer_->SetIteration(++iteration); } mean_error *= 100.0 / max_iterations; - LOG(INFO) << "Tester error rate = " << mean_error << "\n" ; + LOG(INFO) << "Tester error rate = " << 
mean_error << "\n"; return mean_error; } // Tests that the current trainer_ can be converted to int mode and still gets @@ -150,8 +145,7 @@ class LSTMTrainerTest : public testing::Test { // int. double TestIntMode(int test_iterations) { std::vector trainer_data; - EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), - &trainer_data)); + EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), &trainer_data)); // Get the error on the next few iterations in float mode. double float_err = TestIterations(test_iterations); // Restore the dump, convert to int and test error on that. @@ -164,11 +158,11 @@ class LSTMTrainerTest : public testing::Test { // Sets up a trainer with the given language and given recode+ctc condition. // It then verifies that the given str encodes and decodes back to the same // string. - void TestEncodeDecode(const std::string& lang, const std::string& str, bool recode) { + void TestEncodeDecode(const std::string &lang, const std::string &str, bool recode) { std::string unicharset_name = lang + "/" + lang + ".unicharset"; - std::string lstmf_name = lang + ".Arial_Unicode_MS.exp0.lstmf"; - SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name, - lstmf_name, recode, true, 5e-4f, true, lang); + std::string lstmf_name = lang + ".Arial_Unicode_MS.exp0.lstmf"; + SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name, lstmf_name, recode, true, + 5e-4f, true, lang); std::vector labels; EXPECT_TRUE(trainer_->EncodeString(str.c_str(), &labels)); STRING decoded = trainer_->DecodeLabels(labels); @@ -176,7 +170,7 @@ class LSTMTrainerTest : public testing::Test { EXPECT_EQ(str, decoded_str); } // Calls TestEncodeDeode with both recode on and off. 
- void TestEncodeDecodeBoth(const std::string& lang, const std::string& str) { + void TestEncodeDecodeBoth(const std::string &lang, const std::string &str) { TestEncodeDecode(lang, str, false); TestEncodeDecode(lang, str, true); } @@ -184,6 +178,6 @@ class LSTMTrainerTest : public testing::Test { std::unique_ptr trainer_; }; -} // namespace tesseract. +} // namespace tesseract. -#endif // THIRD_PARTY_TESSERACT_UNITTEST_LSTM_TEST_H_ +#endif // THIRD_PARTY_TESSERACT_UNITTEST_LSTM_TEST_H_ diff --git a/unittest/lstmtrainer_test.cc b/unittest/lstmtrainer_test.cc index 3e0dcf55..01a85994 100644 --- a/unittest/lstmtrainer_test.cc +++ b/unittest/lstmtrainer_test.cc @@ -16,8 +16,7 @@ namespace tesseract { TEST_F(LSTMTrainerTest, EncodesEng) { - TestEncodeDecodeBoth("eng", - "The quick brown 'fox' jumps over: the lazy dog!"); + TestEncodeDecodeBoth("eng", "The quick brown 'fox' jumps over: the lazy dog!"); } TEST_F(LSTMTrainerTest, EncodesKan) { @@ -25,8 +24,7 @@ TEST_F(LSTMTrainerTest, EncodesKan) { } TEST_F(LSTMTrainerTest, EncodesKor) { - TestEncodeDecodeBoth("kor", - "이는 것으로 다시 넣을 수는 있지만 선택의 의미는"); + TestEncodeDecodeBoth("kor", "이는 것으로 다시 넣을 수는 있지만 선택의 의미는"); } TEST_F(LSTMTrainerTest, MapCoder) { @@ -47,16 +45,15 @@ TEST_F(LSTMTrainerTest, MapCoder) { std::vector fra_labels; EXPECT_TRUE(fra_trainer.EncodeString(kTestStr.c_str(), &fra_labels)); // Use the mapper to compute what the labels are as deu. 
- std::vector mapping = fra_trainer.MapRecoder(deu_trainer.GetUnicharset(), - deu_trainer.GetRecoder()); + std::vector mapping = + fra_trainer.MapRecoder(deu_trainer.GetUnicharset(), deu_trainer.GetRecoder()); std::vector mapped_fra_labels(fra_labels.size(), -1); for (int i = 0; i < fra_labels.size(); ++i) { mapped_fra_labels[i] = mapping[fra_labels[i]]; EXPECT_NE(-1, mapped_fra_labels[i]) << "i=" << i << ", ch=" << kTestStr[i]; EXPECT_EQ(mapped_fra_labels[i], deu_labels[i]) - << "i=" << i << ", ch=" << kTestStr[i] - << " has deu label=" << deu_labels[i] << ", but mapped to " - << mapped_fra_labels[i]; + << "i=" << i << ", ch=" << kTestStr[i] << " has deu label=" << deu_labels[i] + << ", but mapped to " << mapped_fra_labels[i]; } // The german trainer can now decode them correctly. STRING decoded = deu_trainer.DecodeLabels(mapped_fra_labels); @@ -73,10 +70,9 @@ TEST_F(LSTMTrainerTest, ConvertModel) { deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata")); // Load the fra traineddata, strip out the model, and save to a tmp file. TessdataManager mgr; - std::string fra_data = - file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata"); + std::string fra_data = file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata"); CHECK(mgr.Init(fra_data.c_str())); - LOG(INFO) << "Load " << fra_data << "\n"; + LOG(INFO) << "Load " << fra_data << "\n"; file::MakeTmpdir(); std::string model_path = file::JoinPath(FLAGS_test_tmpdir, "fra.lstm"); CHECK(mgr.ExtractToFile(model_path.c_str())); @@ -91,16 +87,16 @@ TEST_F(LSTMTrainerTest, ConvertModel) { // baseapi_test.cc). 
TessBaseAPI api; api.Init(FLAGS_test_tmpdir, "deu", tesseract::OEM_LSTM_ONLY); - Pix* src_pix = pixRead(TestingNameToPath("phototest.tif").c_str()); + Pix *src_pix = pixRead(TestingNameToPath("phototest.tif").c_str()); CHECK(src_pix); api.SetImage(src_pix); std::unique_ptr result(api.GetUTF8Text()); std::string truth_text; - CHECK_OK(file::GetContents(TestingNameToPath("phototest.gold.txt"), - &truth_text, file::Defaults())); + CHECK_OK( + file::GetContents(TestingNameToPath("phototest.gold.txt"), &truth_text, file::Defaults())); EXPECT_STREQ(truth_text.c_str(), result.get()); pixDestroy(&src_pix); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/mastertrainer_test.cc b/unittest/mastertrainer_test.cc index 0f93e221..8b0cd099 100644 --- a/unittest/mastertrainer_test.cc +++ b/unittest/mastertrainer_test.cc @@ -23,17 +23,17 @@ #include "include_gunit.h" -#include "log.h" // for LOG -#include "unicharset.h" +#include "commontraining.h" #include "errorcounter.h" +#include "log.h" // for LOG #include "mastertrainer.h" #include "shapeclassifier.h" #include "shapetable.h" #include "trainingsample.h" -#include "commontraining.h" +#include "unicharset.h" -#include "absl/strings/numbers.h" // for safe_strto32 -#include "absl/strings/str_split.h" // for absl::StrSplit +#include "absl/strings/numbers.h" // for safe_strto32 +#include "absl/strings/str_split.h" // for absl::StrSplit #include #include @@ -51,12 +51,10 @@ static const int kNumCorrect = kNumNonReject - kNumTop1Errs; // The total number of answers is given by the number of non-rejects plus // all the multiple answers. 
static const int kNumAnswers = kNumNonReject + 2 * (kNumTop2Errs - kNumTopNErrs) + - (kNumTop1Errs - kNumTop2Errs) + - (kNumTopTopErrs - kNumTop1Errs); + (kNumTop1Errs - kNumTop2Errs) + (kNumTopTopErrs - kNumTop1Errs); #ifndef DISABLED_LEGACY_ENGINE -static bool safe_strto32(const std::string& str, int* pResult) -{ +static bool safe_strto32(const std::string &str, int *pResult) { long n = strtol(str.c_str(), nullptr, 0); *pResult = n; return true; @@ -66,8 +64,8 @@ static bool safe_strto32(const std::string& str, int* pResult) // Mock ShapeClassifier that cheats by looking at the correct answer, and // creates a specific pattern of errors that can be tested. class MockClassifier : public ShapeClassifier { - public: - explicit MockClassifier(ShapeTable* shape_table) +public: + explicit MockClassifier(ShapeTable *shape_table) : shape_table_(shape_table), num_done_(0), done_bad_font_(false) { // Add a false font answer to the shape table. We pick a random unichar_id, // add a new shape for it with a false font. Font must actually exist in @@ -83,12 +81,12 @@ class MockClassifier : public ShapeClassifier { // If keep_this (a shape index) is >= 0, then the results should always // contain keep_this, and (if possible) anything of intermediate confidence. // The return value is the number of classes saved in results. - int ClassifySample(const TrainingSample& sample, Pix* page_pix, - int debug, UNICHAR_ID keep_this, - std::vector* results) override { + int ClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, + std::vector *results) override { results->clear(); // Everything except the first kNumNonReject is a reject. 
- if (++num_done_ > kNumNonReject) return 0; + if (++num_done_ > kNumNonReject) + return 0; int class_id = sample.class_id(); int font_id = sample.font_id(); @@ -125,11 +123,13 @@ class MockClassifier : public ShapeClassifier { return results->size(); } // Provides access to the ShapeTable that this classifier works with. - const ShapeTable* GetShapeTable() const override { return shape_table_; } + const ShapeTable *GetShapeTable() const override { + return shape_table_; + } - private: +private: // Borrowed pointer to the ShapeTable. - ShapeTable* shape_table_; + ShapeTable *shape_table_; // Unichar_id of a random character that occurs after the first 60 samples. int false_unichar_id_; // Shape index of prepared false answer for false_unichar_id. @@ -145,16 +145,16 @@ const double kMin1lDistance = 0.25; // The fixture for testing Tesseract. class MasterTrainerTest : public testing::Test { #ifndef DISABLED_LEGACY_ENGINE - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); } - std::string TestDataNameToPath(const std::string& name) { + std::string TestDataNameToPath(const std::string &name) { return file::JoinPath(TESTING_DIR, name); } - std::string TmpNameToPath(const std::string& name) { + std::string TmpNameToPath(const std::string &name) { return file::JoinPath(FLAGS_test_tmpdir, name); } @@ -175,13 +175,12 @@ class MasterTrainerTest : public testing::Test { FLAGS_X = TestDataNameToPath("eng.xheights").c_str(); FLAGS_U = TestDataNameToPath("eng.unicharset").c_str(); std::string tr_file_name(TestDataNameToPath("eng.Arial.exp0.tr")); - const char* argv[] = {tr_file_name.c_str()}; + const char *argv[] = {tr_file_name.c_str()}; int argc = 1; STRING file_prefix; delete shape_table_; shape_table_ = nullptr; - master_trainer_ = - LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix); + master_trainer_ = LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix); EXPECT_TRUE(master_trainer_ != nullptr); 
EXPECT_TRUE(shape_table_ != nullptr); } @@ -207,34 +206,28 @@ class MasterTrainerTest : public testing::Test { int shape_1 = shape_table_->FindShape(unichar_1, font_id); EXPECT_GE(shape_1, 0); - float dist_I_l = - master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l); + float dist_I_l = master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l); // No tolerance here. We expect that I and l should match exactly. EXPECT_EQ(0.0f, dist_I_l); - float dist_l_I = - master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I); + float dist_l_I = master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I); // BOTH ways. EXPECT_EQ(0.0f, dist_l_I); // l/1 on the other hand should be distinct. - float dist_l_1 = - master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1); + float dist_l_1 = master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1); EXPECT_GT(dist_l_1, kMin1lDistance); - float dist_1_l = - master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l); + float dist_1_l = master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l); EXPECT_GT(dist_1_l, kMin1lDistance); // So should I/1. - float dist_I_1 = - master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1); + float dist_I_1 = master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1); EXPECT_GT(dist_I_1, kMin1lDistance); - float dist_1_I = - master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I); + float dist_1_I = master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I); EXPECT_GT(dist_1_I, kMin1lDistance); } // Objects declared here can be used by all tests in the test case for Foo. - ShapeTable* shape_table_; + ShapeTable *shape_table_; std::unique_ptr master_trainer_; #endif }; @@ -263,18 +256,17 @@ TEST_F(MasterTrainerTest, ErrorCounterTest) { LoadMasterTrainer(); // Add the space character to the shape_table_ if not already present to // count junk. 
- if (shape_table_->FindShape(0, -1) < 0) shape_table_->AddShape(0, 0); + if (shape_table_->FindShape(0, -1) < 0) + shape_table_->AddShape(0, 0); // Make a mock classifier. auto shape_classifier = std::make_unique(shape_table_); // Get the accuracy report. STRING accuracy_report; - master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0, - false, shape_classifier.get(), - &accuracy_report); + master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0, false, + shape_classifier.get(), &accuracy_report); LOG(INFO) << accuracy_report.c_str(); std::string result_string = accuracy_report.c_str(); - std::vector results = - absl::StrSplit(result_string, '\t', absl::SkipEmpty()); + std::vector results = absl::StrSplit(result_string, '\t', absl::SkipEmpty()); EXPECT_EQ(tesseract::CT_SIZE + 1, results.size()); int result_values[tesseract::CT_SIZE]; for (int i = 0; i < tesseract::CT_SIZE; ++i) { @@ -290,8 +282,7 @@ TEST_F(MasterTrainerTest, ErrorCounterTest) { EXPECT_EQ(kNumTop2Errs, result_values[tesseract::CT_UNICHAR_TOP2_ERR]); EXPECT_EQ(kNumTopNErrs, result_values[tesseract::CT_UNICHAR_TOPN_ERR]); // Each of the TOPTOP errs also counts as a multi-unichar. 
- EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs, - result_values[tesseract::CT_OK_MULTI_UNICHAR]); + EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs, result_values[tesseract::CT_OK_MULTI_UNICHAR]); EXPECT_EQ(num_samples - kNumNonReject, result_values[tesseract::CT_REJECT]); EXPECT_EQ(kNumAnswers, result_values[tesseract::CT_NUM_RESULTS]); #endif diff --git a/unittest/matrix_test.cc b/unittest/matrix_test.cc index c900308d..c4437eda 100644 --- a/unittest/matrix_test.cc +++ b/unittest/matrix_test.cc @@ -20,7 +20,7 @@ namespace tesseract { class MatrixTest : public ::testing::Test { - protected: +protected: void SetUp() override { std::locale::global(std::locale("")); } @@ -38,7 +38,8 @@ class MatrixTest : public ::testing::Test { for (int i = 0; i < kInputSize_; ++i) { src_.put(0, i, i); } - for (int i = 0; i < kNumDims_; ++i) dims_[i] = 5 - i; + for (int i = 0; i < kNumDims_; ++i) + dims_[i] = 5 - i; } // Number of dimensions in src_. static const int kNumDims_ = 4; @@ -134,4 +135,4 @@ TEST_F(MatrixTest, RotatingTranspose_0_2) { EXPECT_EQ(6, m(15, 0)); } -} // namespace +} // namespace tesseract diff --git a/unittest/networkio_test.cc b/unittest/networkio_test.cc index 3c25f14f..2ced3326 100644 --- a/unittest/networkio_test.cc +++ b/unittest/networkio_test.cc @@ -9,17 +9,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "include_gunit.h" #include "networkio.h" +#include "include_gunit.h" #include "stridemap.h" #ifdef INCLUDE_TENSORFLOW -#include // for xla::Array2D +# include // for xla::Array2D #endif namespace tesseract { class NetworkioTest : public ::testing::Test { - protected: +protected: void SetUp() override { std::locale::global(std::locale("")); } @@ -38,14 +38,13 @@ class NetworkioTest : public ::testing::Test { return a; } // Sets up a NetworkIO with a batch of 2 "images" of known values. 
- void SetupNetworkIO(NetworkIO* nio) { + void SetupNetworkIO(NetworkIO *nio) { std::vector>> arrays; arrays.push_back(SetupArray(3, 4, 0)); arrays.push_back(SetupArray(4, 5, 12)); std::vector> h_w_sizes; for (size_t i = 0; i < arrays.size(); ++i) { - h_w_sizes.emplace_back(arrays[i].get()->height(), - arrays[i].get()->width()); + h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width()); } StrideMap stride_map; stride_map.SetStride(h_w_sizes); @@ -53,8 +52,7 @@ class NetworkioTest : public ::testing::Test { // Iterate over the map, setting nio's contents from the arrays. StrideMap::Index index(stride_map); do { - int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT), - index.index(FD_WIDTH)); + int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT), index.index(FD_WIDTH)); nio->SetPixel(index.t(), 0, 128 + value, 0.0f, 128.0f); nio->SetPixel(index.t(), 1, 128 - value, 0.0f, 128.0f); } while (index.Increment()); @@ -113,9 +111,9 @@ TEST_F(NetworkioTest, CopyWithYReversal) { StrideMap::Index index(copy.stride_map()); int next_t = 0; int pos = 0; - std::vector expected_values = { - 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 27, 28, 29, 30, - 31, 22, 23, 24, 25, 26, 17, 18, 19, 20, 21, 12, 13, 14, 15, 16}; + std::vector expected_values = {8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, + 3, 27, 28, 29, 30, 31, 22, 23, 24, 25, 26, + 17, 18, 19, 20, 21, 12, 13, 14, 15, 16}; do { int t = index.t(); // The indexed values match the expected values. 
@@ -150,9 +148,9 @@ TEST_F(NetworkioTest, CopyWithXReversal) { StrideMap::Index index(copy.stride_map()); int next_t = 0; int pos = 0; - std::vector expected_values = { - 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 16, 15, 14, 13, - 12, 21, 20, 19, 18, 17, 26, 25, 24, 23, 22, 31, 30, 29, 28, 27}; + std::vector expected_values = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, + 8, 16, 15, 14, 13, 12, 21, 20, 19, 18, 17, + 26, 25, 24, 23, 22, 31, 30, 29, 28, 27}; do { int t = index.t(); // The indexed values match the expected values. @@ -187,9 +185,9 @@ TEST_F(NetworkioTest, CopyWithXYTranspose) { StrideMap::Index index(copy.stride_map()); int next_t = 0; int pos = 0; - std::vector expected_values = { - 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 17, 22, 27, - 13, 18, 23, 28, 14, 19, 24, 29, 15, 20, 25, 30, 16, 21, 26, 31}; + std::vector expected_values = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, + 11, 12, 17, 22, 27, 13, 18, 23, 28, 14, 19, + 24, 29, 15, 20, 25, 30, 16, 21, 26, 31}; do { int t = index.t(); // The indexed values match the expected values. @@ -214,4 +212,4 @@ TEST_F(NetworkioTest, CopyWithXYTranspose) { #endif } -} // namespace +} // namespace tesseract diff --git a/unittest/normstrngs_test.cc b/unittest/normstrngs_test.cc index c6a35b6a..714f5de8 100644 --- a/unittest/normstrngs_test.cc +++ b/unittest/normstrngs_test.cc @@ -9,13 +9,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "absl/strings/str_format.h" // for absl::StrFormat -#include "include_gunit.h" #include "normstrngs.h" -#include "normstrngs_test.h" #include +#include "absl/strings/str_format.h" // for absl::StrFormat +#include "include_gunit.h" +#include "normstrngs_test.h" #ifdef INCLUDE_TENSORFLOW -#include "util/utf8/unilib.h" // for UniLib +# include "util/utf8/unilib.h" // for UniLib #endif #include "include_gunit.h" @@ -30,58 +30,51 @@ static std::string EncodeAsUTF8(const char32 ch32) { #endif TEST(NormstrngsTest, BasicText) { - const char* kBasicText = "AbCd Ef"; + const char *kBasicText = "AbCd Ef"; std::string result; EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, - GraphemeNorm::kNormalize, kBasicText, - &result)); + GraphemeNorm::kNormalize, kBasicText, &result)); EXPECT_STREQ(kBasicText, result.c_str()); } TEST(NormstrngsTest, LigatureText) { - const char* kTwoByteLigText = "ij"; // U+0133 (ij) -> ij + const char *kTwoByteLigText = "ij"; // U+0133 (ij) -> ij std::string result; EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, - GraphemeNorm::kNormalize, kTwoByteLigText, - &result)); + GraphemeNorm::kNormalize, kTwoByteLigText, &result)); EXPECT_STREQ("ij", result.c_str()); - const char* kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi + const char *kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, - GraphemeNorm::kNormalize, kThreeByteLigText, - &result)); + GraphemeNorm::kNormalize, kThreeByteLigText, &result)); EXPECT_STREQ("finds", result.c_str()); } TEST(NormstrngsTest, OcrSpecificNormalization) { - const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+027 (') + const char *kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+027 (') std::string result; EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, - GraphemeNorm::kNormalize, kSingleQuoteText, - &result)); + GraphemeNorm::kNormalize, 
kSingleQuoteText, &result)); EXPECT_STREQ("'Hi", result.c_str()); - const char* kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+022 (") + const char *kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+022 (") EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, - GraphemeNorm::kNormalize, kDoubleQuoteText, - &result)); + GraphemeNorm::kNormalize, kDoubleQuoteText, &result)); EXPECT_STREQ("\"Hi", result.c_str()); - const char* kEmDash = "Hi—"; // U+2014 (—) -> U+02D (-) + const char *kEmDash = "Hi—"; // U+2014 (—) -> U+02D (-) EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, GraphemeNorm::kNormalize, kEmDash, &result)); EXPECT_STREQ("Hi-", result.c_str()); // Without the ocr normalization, these changes are not made. - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, kSingleQuoteText, - &result)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize, + kSingleQuoteText, &result)); EXPECT_STREQ(kSingleQuoteText, result.c_str()); - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, kDoubleQuoteText, - &result)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize, + kDoubleQuoteText, &result)); EXPECT_STREQ(kDoubleQuoteText, result.c_str()); - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, kEmDash, &result)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize, + kEmDash, &result)); EXPECT_STREQ(kEmDash, result.c_str()); } @@ -90,38 +83,35 @@ const char kEngText[] = "the quick brown fox jumps over the lazy dog"; const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; const char kKorText[] = "이는 것으로"; // Hindi words containing illegal vowel sequences. 
-const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात", - "कहीअे", "पत्रिाका", "छह्णाीस"}; +const char *kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात", "कहीअे", "पत्रिाका", "छह्णाीस"}; // Thai illegal sequences. -const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"}; +const char *kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"}; TEST(NormstrngsTest, DetectsCorrectText) { std::string chars; - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, kEngText, &chars)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize, + kEngText, &chars)); EXPECT_STREQ(kEngText, chars.c_str()); - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, kHinText, &chars)) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize, + kHinText, &chars)) << "Incorrect text: '" << kHinText << "'"; EXPECT_STREQ(kHinText, chars.c_str()); - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, kKorText, &chars)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize, + kKorText, &chars)); EXPECT_STREQ(kKorText, chars.c_str()); } TEST(NormstrngsTest, DetectsIncorrectText) { for (size_t i = 0; i < countof(kBadlyFormedHinWords); ++i) { EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, - kBadlyFormedHinWords[i], nullptr)) + GraphemeNorm::kNormalize, kBadlyFormedHinWords[i], nullptr)) << kBadlyFormedHinWords[i]; } for (size_t i = 0; i < countof(kBadlyFormedThaiWords); ++i) { EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, - GraphemeNorm::kNormalize, - kBadlyFormedThaiWords[i], nullptr)) + GraphemeNorm::kNormalize, kBadlyFormedThaiWords[i], nullptr)) << kBadlyFormedThaiWords[i]; } } @@ -129,9 +119,8 @@ 
TEST(NormstrngsTest, DetectsIncorrectText) { TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) { std::string nonindic = "Here's some latin text."; std::string dest; - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, nonindic.c_str(), - &dest)) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + nonindic.c_str(), &dest)) << PrintString32WithUnicodes(nonindic); EXPECT_EQ(dest, nonindic); } @@ -140,9 +129,8 @@ TEST(NormstrngsTest, NoLonelyJoiners) { std::string str = "x\u200d\u0d06\u0d34\u0d02"; std::vector glyphs; // Returns true, but the joiner is gone. - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[0], std::string("x")); @@ -154,9 +142,8 @@ TEST(NormstrngsTest, NoLonelyJoinersPlus) { std::string str = "\u0d2a\u200d+\u0d2a\u0d4b"; std::vector glyphs; // Returns true, but the joiner is gone. - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[0], std::string("\u0d2a")); @@ -171,9 +158,8 @@ TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) { str = "\u200d\u200c\u200d"; // Without the plus, the string is invalid. 
std::string result; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)) + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)) << PrintString32WithUnicodes(result); } @@ -184,14 +170,14 @@ TEST(NormstrngsTest, JoinersStayInArabic) { } TEST(NormstrngsTest, DigitOK) { - std::string str = "\u0cea"; // Digit 4. + std::string str = "\u0cea"; // Digit 4. ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); } TEST(NormstrngsTest, DandaOK) { - std::string str = "\u0964"; // Single danda. + std::string str = "\u0964"; // Single danda. ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); - str = "\u0965"; // Double danda. + str = "\u0965"; // Double danda. ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); } @@ -312,7 +298,7 @@ TEST(NormstrngsTest, AllScriptsRegtest) { "hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng " "Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}}); - for (const auto& p : kScriptText) { + for (const auto &p : kScriptText) { std::string normalized; EXPECT_TRUE(tesseract::NormalizeUTF8String( tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, @@ -385,8 +371,7 @@ TEST(NormstrngsTest, IsInterchangeValid7BitAscii) { for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); std::string str = EncodeAsUTF8(ch); - EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), - IsInterchangeValid7BitAscii(ch)); + EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), IsInterchangeValid7BitAscii(ch)); } #else // Skipped because of missing UniLib::IsInterchangeValid7BitAscii. 
@@ -409,7 +394,8 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) { const int32_t kMinUnicodeValue = 33; const int32_t kMaxUnicodeValue = 0x10FFFF; for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { - if (!IsValidCodepoint(ch)) continue; + if (!IsValidCodepoint(ch)) + continue; SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); std::string str = EncodeAsUTF8(ch); const std::string expected_half_str = @@ -419,4 +405,4 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) { #endif } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/normstrngs_test.h b/unittest/normstrngs_test.h index 3b459348..80577f73 100644 --- a/unittest/normstrngs_test.h +++ b/unittest/normstrngs_test.h @@ -12,16 +12,16 @@ #ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ #define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ -#include // for std::stringstream +#include +#include // for std::stringstream #include #include #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" -#include namespace tesseract { -inline std::string CodepointList(const std::vector& str32) { +inline std::string CodepointList(const std::vector &str32) { std::stringstream result; int total_chars = str32.size(); result << std::hex; @@ -31,54 +31,46 @@ inline std::string CodepointList(const std::vector& str32) { return result.str(); } -inline std::string PrintString32WithUnicodes(const std::string& str) { +inline std::string PrintString32WithUnicodes(const std::string &str) { std::vector str32 = UNICHAR::UTF8ToUTF32(str.c_str()); return absl::StrCat("\"", str, "\" ", CodepointList(str32)); } -inline std::string PrintStringVectorWithUnicodes(const std::vector& glyphs) { +inline std::string PrintStringVectorWithUnicodes(const std::vector &glyphs) { std::string result; - for (const auto& s : glyphs) { + for (const auto &s : glyphs) { result += "Glyph:"; result += PrintString32WithUnicodes(s) + "\n"; } return result; } -inline void ExpectGraphemeModeResults(const std::string& str, 
UnicodeNormMode u_mode, - int unicode_count, int glyph_count, - int grapheme_count, - const std::string& target_str) { +inline void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode, + int unicode_count, int glyph_count, int grapheme_count, + const std::string &target_str) { std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true, - str.c_str(), &glyphs)); - EXPECT_EQ(glyphs.size(), unicode_count) - << PrintStringVectorWithUnicodes(glyphs); + u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), unicode_count) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, - GraphemeNormMode::kGlyphSplit, true, - str.c_str(), &glyphs)); - EXPECT_EQ(glyphs.size(), glyph_count) - << PrintStringVectorWithUnicodes(glyphs); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), glyph_count) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, - GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)); - EXPECT_EQ(glyphs.size(), grapheme_count) - << PrintStringVectorWithUnicodes(glyphs); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kCombined, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), grapheme_count) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, - GraphemeNormMode::kSingleString, + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kSingleString, 
true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(target_str, glyphs[0]); std::string result; - EXPECT_TRUE(NormalizeUTF8String( - u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result)); + EXPECT_TRUE( + NormalizeUTF8String(u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result)); EXPECT_EQ(target_str, result); } -} // namespace tesseract +} // namespace tesseract -#endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ +#endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ diff --git a/unittest/nthitem_test.cc b/unittest/nthitem_test.cc index 9f9e9da8..af8bf854 100644 --- a/unittest/nthitem_test.cc +++ b/unittest/nthitem_test.cc @@ -19,15 +19,15 @@ int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0, -32767, 6, 7}; // The fixture for testing GenericHeap and DoublePtr. class NthItemTest : public testing::Test { - protected: +protected: void SetUp() override { std::locale::global(std::locale("")); } - public: +public: virtual ~NthItemTest(); // Pushes the test data onto the KDVector. 
- void PushTestData(KDVector* v) { + void PushTestData(KDVector *v) { for (size_t i = 0; i < countof(test_data); ++i) { IntKDPair pair(test_data[i], i); v->push_back(pair); @@ -117,4 +117,4 @@ TEST_F(NthItemTest, EqualTest) { EXPECT_TRUE(v[index].data() == 4 || v[index].data() == 12); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/osd_test.cc b/unittest/osd_test.cc index a45ec1b9..8ff1227d 100644 --- a/unittest/osd_test.cc +++ b/unittest/osd_test.cc @@ -19,34 +19,33 @@ // expects clones of tessdata, tessdata_fast and tessdata_best repos //#include "log.h" -#include -#include // std::unique_ptr -#include -#include -#include "include_gunit.h" #include +#include +#include +#include // std::unique_ptr +#include +#include "include_gunit.h" namespace tesseract { class TestClass : public testing::Test { - protected: +protected: }; #ifndef DISABLED_LEGACY_ENGINE -static void OSDTester(int expected_deg, const char* imgname, const char* tessdatadir) { +static void OSDTester(int expected_deg, const char *imgname, const char *tessdatadir) { // log.info() << tessdatadir << " for image: " << imgname << std::endl; std::unique_ptr api(new tesseract::TessBaseAPI()); - ASSERT_FALSE(api->Init(tessdatadir, "osd")) - << "Could not initialize tesseract."; - Pix* image = pixRead(imgname); + ASSERT_FALSE(api->Init(tessdatadir, "osd")) << "Could not initialize tesseract."; + Pix *image = pixRead(imgname); ASSERT_TRUE(image != nullptr) << "Failed to read test image."; api->SetImage(image); int orient_deg; float orient_conf; - const char* script_name; + const char *script_name; float script_conf; - bool detected = api->DetectOrientationScript(&orient_deg, &orient_conf, - &script_name, &script_conf); + bool detected = + api->DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf); ASSERT_FALSE(!detected) << "Failed to detect OSD."; printf( "************ Orientation in degrees: %d, Orientation confidence: %.2f\n" @@ -59,75 +58,66 @@ static 
void OSDTester(int expected_deg, const char* imgname, const char* tessdat #endif class OSDTest : public TestClass, - public ::testing::WithParamInterface< - std::tuple> {}; + public ::testing::WithParamInterface> { +}; TEST_P(OSDTest, MatchOrientationDegrees) { #ifdef DISABLED_LEGACY_ENGINE // Skip test because TessBaseAPI::DetectOrientationScript is missing. GTEST_SKIP(); #else - OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam())); + OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam())); #endif } -INSTANTIATE_TEST_SUITE_P( - TessdataEngEuroHebrew, OSDTest, - ::testing::Combine(::testing::Values(0), - ::testing::Values(TESTING_DIR "/phototest.tif", - TESTING_DIR "/eurotext.tif", - TESTING_DIR "/hebrew.png"), - ::testing::Values(TESSDATA_DIR))); +INSTANTIATE_TEST_SUITE_P(TessdataEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR))); -INSTANTIATE_TEST_SUITE_P( - TessdataBestEngEuroHebrew, OSDTest, - ::testing::Combine(::testing::Values(0), - ::testing::Values(TESTING_DIR "/phototest.tif", - TESTING_DIR "/eurotext.tif", - TESTING_DIR "/hebrew.png"), - ::testing::Values(TESSDATA_DIR "_best"))); +INSTANTIATE_TEST_SUITE_P(TessdataBestEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR "_best"))); -INSTANTIATE_TEST_SUITE_P( - TessdataFastEngEuroHebrew, OSDTest, - ::testing::Combine(::testing::Values(0), - ::testing::Values(TESTING_DIR "/phototest.tif", - TESTING_DIR "/eurotext.tif", - TESTING_DIR "/hebrew.png"), - ::testing::Values(TESSDATA_DIR "_fast"))); +INSTANTIATE_TEST_SUITE_P(TessdataFastEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR 
"/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); -INSTANTIATE_TEST_SUITE_P( - TessdataFastRotated90, OSDTest, - ::testing::Combine(::testing::Values(90), - ::testing::Values(TESTING_DIR - "/phototest-rotated-R.png"), - ::testing::Values(TESSDATA_DIR "_fast"))); +INSTANTIATE_TEST_SUITE_P(TessdataFastRotated90, OSDTest, + ::testing::Combine(::testing::Values(90), + ::testing::Values(TESTING_DIR + "/phototest-rotated-R.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); -INSTANTIATE_TEST_SUITE_P( - TessdataFastRotated180, OSDTest, - ::testing::Combine(::testing::Values(180), - ::testing::Values(TESTING_DIR - "/phototest-rotated-180.png"), - ::testing::Values(TESSDATA_DIR "_fast"))); +INSTANTIATE_TEST_SUITE_P(TessdataFastRotated180, OSDTest, + ::testing::Combine(::testing::Values(180), + ::testing::Values(TESTING_DIR + "/phototest-rotated-180.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); -INSTANTIATE_TEST_SUITE_P( - TessdataFastRotated270, OSDTest, - ::testing::Combine(::testing::Values(270), - ::testing::Values(TESTING_DIR - "/phototest-rotated-L.png"), - ::testing::Values(TESSDATA_DIR "_fast"))); +INSTANTIATE_TEST_SUITE_P(TessdataFastRotated270, OSDTest, + ::testing::Combine(::testing::Values(270), + ::testing::Values(TESTING_DIR + "/phototest-rotated-L.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); -INSTANTIATE_TEST_SUITE_P( - TessdataFastDevaRotated270, OSDTest, - ::testing::Combine(::testing::Values(270), - ::testing::Values(TESTING_DIR - "/devatest-rotated-270.png"), - ::testing::Values(TESSDATA_DIR "_fast"))); +INSTANTIATE_TEST_SUITE_P(TessdataFastDevaRotated270, OSDTest, + ::testing::Combine(::testing::Values(270), + ::testing::Values(TESTING_DIR + "/devatest-rotated-270.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); -INSTANTIATE_TEST_SUITE_P( - TessdataFastDeva, OSDTest, - ::testing::Combine(::testing::Values(0), - ::testing::Values(TESTING_DIR "/devatest.png"), - 
::testing::Values(TESSDATA_DIR "_fast"))); +INSTANTIATE_TEST_SUITE_P(TessdataFastDeva, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/devatest.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); -} // namespace +} // namespace tesseract diff --git a/unittest/pagesegmode_test.cc b/unittest/pagesegmode_test.cc index 5cdd4299..fc48dde2 100644 --- a/unittest/pagesegmode_test.cc +++ b/unittest/pagesegmode_test.cc @@ -10,21 +10,21 @@ // limitations under the License. #if defined(_WIN32) -#include // for _access +# include // for _access #else -#include // for access +# include // for access #endif -#include #include #include +#include #include "helpers.h" -#include "log.h" #include "include_gunit.h" +#include "log.h" namespace tesseract { // Replacement for std::filesystem::exists (C++-17) -static bool file_exists(const char* filename) { +static bool file_exists(const char *filename) { #if defined(_WIN32) return _access(filename, 0) == 0; #else @@ -34,7 +34,7 @@ static bool file_exists(const char* filename) { // The fixture for testing Tesseract. class PageSegModeTest : public testing::Test { - protected: +protected: PageSegModeTest() = default; ~PageSegModeTest() { pixDestroy(&src_pix_); @@ -45,7 +45,7 @@ class PageSegModeTest : public testing::Test { std::locale::global(system_locale); } - void SetImage(const char* filename) { + void SetImage(const char *filename) { pixDestroy(&src_pix_); src_pix_ = pixRead(filename); api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY); @@ -54,11 +54,11 @@ class PageSegModeTest : public testing::Test { // Tests that the given rectangle produces exactly the given text in the // given segmentation mode (after chopping off the last 2 newlines.) 
- void VerifyRectText(tesseract::PageSegMode mode, const char* str, - int left, int top, int width, int height) { + void VerifyRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width, + int height) { api_.SetPageSegMode(mode); api_.SetRectangle(left, top, width, height); - char* result = api_.GetUTF8Text(); + char *result = api_.GetUTF8Text(); chomp_string(result); chomp_string(result); EXPECT_STREQ(str, result); @@ -67,16 +67,16 @@ class PageSegModeTest : public testing::Test { // Tests that the given rectangle does NOT produce the given text in the // given segmentation mode. - void NotRectText(tesseract::PageSegMode mode, const char* str, - int left, int top, int width, int height) { + void NotRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width, + int height) { api_.SetPageSegMode(mode); api_.SetRectangle(left, top, width, height); - char* result = api_.GetUTF8Text(); + char *result = api_.GetUTF8Text(); EXPECT_STRNE(str, result); delete[] result; } - Pix* src_pix_ = nullptr; + Pix *src_pix_ = nullptr; std::string ocr_text_; tesseract::TessBaseAPI api_; }; @@ -95,20 +95,15 @@ TEST_F(PageSegModeTest, WordTest) { VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1411, 252, 78, 62); VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1396, 218, 114, 102); // Test a random pair of words as a line - VerifyRectText(tesseract::PSM_SINGLE_LINE, - "What should", 237, 393, 256, 36); + VerifyRectText(tesseract::PSM_SINGLE_LINE, "What should", 237, 393, 256, 36); // Test a random pair of words as a word - VerifyRectText(tesseract::PSM_SINGLE_WORD, - "Whatshould", 237, 393, 256, 36); + VerifyRectText(tesseract::PSM_SINGLE_WORD, "Whatshould", 237, 393, 256, 36); // Test single block mode. - VerifyRectText(tesseract::PSM_SINGLE_BLOCK, - "both the\nfrom the", 237, 450, 172, 94); + VerifyRectText(tesseract::PSM_SINGLE_BLOCK, "both the\nfrom the", 237, 450, 172, 94); // But doesn't work in line or word mode. 
- NotRectText(tesseract::PSM_SINGLE_LINE, - "both the\nfrom the", 237, 450, 172, 94); - NotRectText(tesseract::PSM_SINGLE_WORD, - "both the\nfrom the", 237, 450, 172, 94); + NotRectText(tesseract::PSM_SINGLE_LINE, "both the\nfrom the", 237, 450, 172, 94); + NotRectText(tesseract::PSM_SINGLE_WORD, "both the\nfrom the", 237, 450, 172, 94); } } -} // namespace +} // namespace tesseract diff --git a/unittest/pango_font_info_test.cc b/unittest/pango_font_info_test.cc index e5a6f8f2..d15cb180 100644 --- a/unittest/pango_font_info_test.cc +++ b/unittest/pango_font_info_test.cc @@ -9,35 +9,33 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "pango_font_info.h" +#include #include #include -#include -#include "include_gunit.h" +#include "absl/strings/str_cat.h" // for absl::StrCat #include "commandlineflags.h" #include "fileio.h" -#include "pango_font_info.h" -#include "absl/strings/str_cat.h" // for absl::StrCat -#include "gmock/gmock-matchers.h" // for EXPECT_THAT +#include "gmock/gmock-matchers.h" // for EXPECT_THAT +#include "include_gunit.h" #ifdef INCLUDE_TENSORFLOW -#include "util/utf8/unicodetext.h" // for UnicodeText +# include "util/utf8/unicodetext.h" // for UnicodeText #endif namespace tesseract { // Fonts in testdata directory -const char* kExpectedFontNames[] = { - "Arab", - "Arial Bold Italic", - "DejaVu Sans Ultra-Light", - "Lohit Hindi", +const char *kExpectedFontNames[] = {"Arab", + "Arial Bold Italic", + "DejaVu Sans Ultra-Light", + "Lohit Hindi", #if PANGO_VERSION <= 12005 - "Times New Roman", + "Times New Roman", #else - "Times New Roman,", // Pango v1.36.2 requires a trailing ',' + "Times New Roman,", // Pango v1.36.2 requires a trailing ',' #endif - "UnBatang", - "Verdana" -}; + "UnBatang", + "Verdana"}; // Sample text used in tests. 
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع"; @@ -45,18 +43,17 @@ const char kEngText[] = "the quick brown fox jumps over the lazy dog"; const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; const char kKorText[] = "이는 것으로"; // Hindi words containing illegal vowel sequences. -const char* kBadlyFormedHinWords[] = { +const char *kBadlyFormedHinWords[] = { #if PANGO_VERSION <= 12005 - "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस", + "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस", #endif - // Pango v1.36.2 will render the above words even though they are invalid. - "प्रंात", nullptr -}; + // Pango v1.36.2 will render the above words even though they are invalid. + "प्रंात", nullptr}; -static PangoFontMap* font_map; +static PangoFontMap *font_map; class PangoFontInfoTest : public ::testing::Test { - protected: +protected: void SetUp() override { if (!font_map) { font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); @@ -135,8 +132,7 @@ TEST_F(PangoFontInfoTest, CanRenderString) { TEST_F(PangoFontInfoTest, CanRenderLigature) { font_info_.ParseFontDescriptionName("Arab 12"); const char kArabicLigature[] = "لا"; - EXPECT_TRUE( - font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature))); + EXPECT_TRUE(font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature))); printf("Next word\n"); EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText))); @@ -150,8 +146,8 @@ TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) { TEST_F(PangoFontInfoTest, CannotRenderInvalidString) { font_info_.ParseFontDescriptionName("Lohit Hindi 12"); for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) { - EXPECT_FALSE(font_info_.CanRenderString(kBadlyFormedHinWords[i], - strlen(kBadlyFormedHinWords[i]))) + EXPECT_FALSE( + font_info_.CanRenderString(kBadlyFormedHinWords[i], strlen(kBadlyFormedHinWords[i]))) << "Can render " << kBadlyFormedHinWords[i]; } } @@ -164,10 +160,10 @@ 
TEST_F(PangoFontInfoTest, CanDropUncoveredChars) { EXPECT_EQ("oice", word); // Don't drop non-letter characters like word joiners. - const char* kJoiners[] = { - "\u2060", // U+2060 (WJ) - "\u200C", // U+200C (ZWJ) - "\u200D" // U+200D (ZWNJ) + const char *kJoiners[] = { + "\u2060", // U+2060 (WJ) + "\u200C", // U+200C (ZWJ) + "\u200D" // U+200D (ZWNJ) }; for (size_t i = 0; i < countof(kJoiners); ++i) { word = kJoiners[i]; @@ -179,7 +175,7 @@ TEST_F(PangoFontInfoTest, CanDropUncoveredChars) { // ------------------------ FontUtils ------------------------------------ class FontUtilsTest : public ::testing::Test { - protected: +protected: void SetUp() override { file::MakeTmpdir(); } @@ -195,17 +191,17 @@ class FontUtilsTest : public ::testing::Test { } #ifdef INCLUDE_TENSORFLOW - void CountUnicodeChars(const char* utf8_text, - std::unordered_map* ch_map) { + void CountUnicodeChars(const char *utf8_text, std::unordered_map *ch_map) { ch_map->clear(); UnicodeText ut; ut.PointToUTF8(utf8_text, strlen(utf8_text)); for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) { -#if 0 +# if 0 if (UnicodeProps::IsWhitespace(*it)) continue; -#else - if (std::isspace(*it)) continue; -#endif +# else + if (std::isspace(*it)) + continue; +# endif ++(*ch_map)[*it]; } } @@ -235,9 +231,9 @@ TEST_F(FontUtilsTest, DoesDetectMissingFonts) { } TEST_F(FontUtilsTest, DoesListAvailableFonts) { - const std::vector& fonts = FontUtils::ListAvailableFonts(); + const std::vector &fonts = FontUtils::ListAvailableFonts(); EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames)); - for (auto& font : fonts) { + for (auto &font : fonts) { PangoFontInfo font_info; EXPECT_TRUE(font_info.ParseFontDescriptionName(font)); } @@ -248,8 +244,8 @@ TEST_F(FontUtilsTest, DoesFindBestFonts) { std::string fonts_list; std::unordered_map ch_map; CountUnicodeChars(kEngText, &ch_map); - EXPECT_EQ(26, ch_map.size()); // 26 letters - std::vector > > font_flags; + EXPECT_EQ(26, ch_map.size()); 
// 26 letters + std::vector > > font_flags; std::string best_list = FontUtils::BestFonts(ch_map, &font_flags); EXPECT_TRUE(best_list.size()); // All fonts except Lohit Hindi should render English text. @@ -265,14 +261,14 @@ TEST_F(FontUtilsTest, DoesFindBestFonts) { #endif TEST_F(FontUtilsTest, DoesSelectFont) { - const char* kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr}; - const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr}; + const char *kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr}; + const char *kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr}; for (int i = 0; kLangText[i] != nullptr; ++i) { SCOPED_TRACE(kLangNames[i]); std::vector graphemes; std::string selected_font; - EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), - &selected_font, &graphemes)); + EXPECT_TRUE( + FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), &selected_font, &graphemes)); EXPECT_TRUE(selected_font.size()); EXPECT_TRUE(graphemes.size()); } @@ -282,8 +278,8 @@ TEST_F(FontUtilsTest, DoesFailToSelectFont) { const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع"; std::vector graphemes; std::string selected_font; - EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), - &selected_font, &graphemes)); + EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), &selected_font, + &graphemes)); } #if 0 @@ -301,9 +297,9 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) { EXPECT_TRUE(unicode_mask[kHindiChar]); EXPECT_TRUE(unicode_mask[kArabicChar]); EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian. -#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham +# if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham. 
-#endif +# endif unicode_mask.clear(); std::vector selected_fonts; @@ -322,13 +318,13 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) { for (size_t f = 0; f < countof(kExpectedFontNames); ++f) { SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f])); FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask); -#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham +# if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham EXPECT_FALSE(unicode_mask[kOghamChar]); -#endif +# endif EXPECT_FALSE(unicode_mask[kMongolianChar]); unicode_mask.clear(); } } #endif -} // namespace +} // namespace tesseract diff --git a/unittest/paragraphs_test.cc b/unittest/paragraphs_test.cc index e58bdd9e..7fc677e9 100644 --- a/unittest/paragraphs_test.cc +++ b/unittest/paragraphs_test.cc @@ -9,14 +9,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // for std::string +#include // for std::string -#include "absl/strings/str_cat.h" // for absl::StrCat -#include "absl/strings/str_join.h" // for absl::StrJoin -#include "absl/strings/str_split.h" // for absl::StrSplit +#include "absl/strings/str_cat.h" // for absl::StrCat +#include "absl/strings/str_join.h" // for absl::StrJoin +#include "absl/strings/str_split.h" // for absl::StrSplit -#include "include_gunit.h" // for TEST -#include "log.h" // for LOG +#include "include_gunit.h" // for TEST +#include "log.h" // for LOG #include "genericvector.h" // ccmain @@ -34,13 +34,13 @@ const ParagraphJustification kRight = JUSTIFICATION_RIGHT; const ParagraphJustification kUnknown = JUSTIFICATION_UNKNOWN; enum TextModelInputType { - PCONT = 0, // Continuation line of a paragraph (default). - PSTART = 1, // First line of a paragraph. - PNONE = 2, // Not a paragraph line. + PCONT = 0, // Continuation line of a paragraph (default). + PSTART = 1, // First line of a paragraph. + PNONE = 2, // Not a paragraph line. 
}; struct TextAndModel { - const char* ascii; + const char *ascii; TextModelInputType model_type; // fields corresponding to PARA (see ccstruct/ocrpara.h) @@ -51,12 +51,11 @@ struct TextAndModel { // Imagine that the given text is typewriter ASCII with each character ten // pixels wide and twenty pixels high and return an appropriate row_info. -void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) { +void AsciiToRowInfo(const char *text, int row_number, RowInfo *info) { const int kCharWidth = 10; const int kLineSpace = 30; info->text = text; - info->has_leaders = - strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr; + info->has_leaders = strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr; info->has_drop_cap = false; info->pix_ldistance = info->pix_rdistance = 0; info->average_interword_space = kCharWidth; @@ -66,7 +65,8 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) { std::vector words = absl::StrSplit(text, ' ', absl::SkipEmpty()); info->num_words = words.size(); - if (info->num_words < 1) return; + if (info->num_words < 1) + return; info->lword_text = words[0].c_str(); info->rword_text = words[words.size() - 1].c_str(); @@ -75,8 +75,7 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) { lspace++; } int rspace = 0; - while (rspace < info->text.size() && - text[info->text.size() - rspace - 1] == ' ') { + while (rspace < info->text.size() && text[info->text.size() - rspace - 1] == ' ') { rspace++; } @@ -87,20 +86,16 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) { int rword_width = kCharWidth * info->rword_text.size(); info->pix_ldistance = lspace * kCharWidth; info->pix_rdistance = rspace * kCharWidth; - info->lword_box = - TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top); + info->lword_box = TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top); info->rword_box = TBOX(row_right - info->pix_rdistance - 
rword_width, bottom, row_right - info->pix_rdistance, top); - LeftWordAttributes( - nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item, - &info->lword_likely_starts_idea, &info->lword_likely_ends_idea); - RightWordAttributes( - nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item, - &info->rword_likely_starts_idea, &info->rword_likely_ends_idea); + LeftWordAttributes(nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item, + &info->lword_likely_starts_idea, &info->lword_likely_ends_idea); + RightWordAttributes(nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item, + &info->rword_likely_starts_idea, &info->rword_likely_ends_idea); } -void MakeAsciiRowInfos(const TextAndModel* row_infos, int n, - std::vector* output) { +void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector *output) { output->clear(); RowInfo info; for (int i = 0; i < n; i++) { @@ -111,8 +106,8 @@ void MakeAsciiRowInfos(const TextAndModel* row_infos, int n, // Given n rows of reference ground truth, evaluate whether the n rows // of PARA * pointers yield the same paragraph breakpoints. 
-void EvaluateParagraphDetection(const TextAndModel* correct, int n, - const GenericVector& detector_output) { +void EvaluateParagraphDetection(const TextAndModel *correct, int n, + const GenericVector &detector_output) { int incorrect_breaks = 0; int missed_breaks = 0; int poorly_matched_models = 0; @@ -122,8 +117,10 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n, for (int i = 1; i < n; i++) { bool has_break = correct[i].model_type != PCONT; bool detected_break = (detector_output[i - 1] != detector_output[i]); - if (has_break && !detected_break) missed_breaks++; - if (detected_break && !has_break) incorrect_breaks++; + if (has_break && !detected_break) + missed_breaks++; + if (detected_break && !has_break) + incorrect_breaks++; if (has_break) { if (correct[i].model_type == PNONE) { if (detector_output[i]->model != nullptr) { @@ -150,18 +147,17 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n, EXPECT_EQ(poorly_matched_models, 0); EXPECT_EQ(bad_list_items, 0); EXPECT_EQ(bad_crowns, 0); - if (incorrect_breaks || missed_breaks || poorly_matched_models || - bad_list_items || bad_crowns) { + if (incorrect_breaks || missed_breaks || poorly_matched_models || bad_list_items || bad_crowns) { std::vector dbg_lines; dbg_lines.push_back("# =========================="); dbg_lines.push_back("# Correct paragraph breaks:"); dbg_lines.push_back("# =========================="); for (int i = 0; i < n; i++) { if (correct[i].model_type != PCONT) { - dbg_lines.push_back(absl::StrCat( - correct[i].ascii, " # ", correct[i].model.ToString().c_str(), - correct[i].is_very_first_or_continuation ? " crown" : "", - correct[i].is_list_item ? " li" : "")); + dbg_lines.push_back(absl::StrCat(correct[i].ascii, " # ", + correct[i].model.ToString().c_str(), + correct[i].is_very_first_or_continuation ? " crown" : "", + correct[i].is_list_item ? 
" li" : "")); } else { dbg_lines.push_back(correct[i].ascii); } @@ -174,10 +170,10 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n, std::string annotation; if (i == 0 || (detector_output[i - 1] != detector_output[i])) { if (detector_output[i] && detector_output[i]->model) { - annotation += absl::StrCat( - " # ", detector_output[i]->model->ToString().c_str(), - detector_output[i]->is_very_first_or_continuation ? " crown" : "", - detector_output[i]->is_list_item ? " li" : ""); + annotation += + absl::StrCat(" # ", detector_output[i]->model->ToString().c_str(), + detector_output[i]->is_very_first_or_continuation ? " crown" : "", + detector_output[i]->is_list_item ? " li" : ""); } else { annotation = " # Unmodeled paragraph."; } @@ -188,18 +184,17 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n, } } -void TestParagraphDetection(const TextAndModel* correct, int num_rows) { +void TestParagraphDetection(const TextAndModel *correct, int num_rows) { std::vector row_infos; - GenericVector row_owners; + GenericVector row_owners; PARA_LIST paragraphs; - std::vector models; + std::vector models; MakeAsciiRowInfos(correct, num_rows, &row_infos); int debug_level(3); - tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, ¶graphs, - &models); + tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, ¶graphs, &models); EvaluateParagraphDetection(correct, num_rows, row_owners); - for (auto* model : models) { + for (auto *model : models) { delete model; } } @@ -230,51 +225,47 @@ TEST(ParagraphsTest, ListItemsIdentified) { typedef ParagraphModel PModel; const TextAndModel kTwoSimpleParagraphs[] = { - {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"This paragraph starts at the top", PCONT, PModel(), false, false}, - {"of the page and takes 3 lines. 
", PCONT, PModel(), false, false}, - {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"which indicates that the first ", PCONT, PModel(), false, false}, - {"paragraph is not a continuation ", PCONT, PModel(), false, false}, - {"from a previous page, as it is ", PCONT, PModel(), false, false}, - {"indented just like this second ", PCONT, PModel(), false, false}, - {"paragraph. ", PCONT, PModel(), false, false}, + {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top", PCONT, PModel(), false, false}, + {"of the page and takes 3 lines. ", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestSimpleParagraphDetection) { - TestParagraphDetection(kTwoSimpleParagraphs, - countof(kTwoSimpleParagraphs)); + TestParagraphDetection(kTwoSimpleParagraphs, countof(kTwoSimpleParagraphs)); } const TextAndModel kFewCluesWithCrown[] = { - {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), - true, false}, - {"of the page and takes two lines.", PCONT, PModel(), false, false}, - {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"which indicates that the first ", PCONT, PModel(), false, false}, - {"paragraph is a continuation from", PCONT, PModel(), false, false}, - {"a previous page, as it is ", PCONT, PModel(), false, false}, - {"indented just like this second ", PCONT, PModel(), false, false}, - {"paragraph. 
", PCONT, PModel(), false, false}, + {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, + {"of the page and takes two lines.", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is a continuation from", PCONT, PModel(), false, false}, + {"a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestFewCluesWithCrown) { - TestParagraphDetection(kFewCluesWithCrown, - countof(kFewCluesWithCrown)); + TestParagraphDetection(kFewCluesWithCrown, countof(kFewCluesWithCrown)); } const TextAndModel kCrownedParagraph[] = { - {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), - true, false}, - {"often not indented as the rest ", PCONT, PModel(), false, false}, - {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, - {"less it should be counted as the", PCONT, PModel(), false, false}, - {"same type of paragraph. ", PCONT, PModel(), false, false}, - {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"graphs are both indented two ", PCONT, PModel(), false, false}, - {"spaces. ", PCONT, PModel(), false, false}, - {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false}, + {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, + {"often not indented as the rest ", PCONT, PModel(), false, false}, + {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, + {"less it should be counted as the", PCONT, PModel(), false, false}, + {"same type of paragraph. 
", PCONT, PModel(), false, false}, + {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"graphs are both indented two ", PCONT, PModel(), false, false}, + {"spaces. ", PCONT, PModel(), false, false}, + {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestCrownParagraphDetection) { @@ -282,67 +273,66 @@ TEST(ParagraphsTest, TestCrownParagraphDetection) { } const TextAndModel kFlushLeftParagraphs[] = { - {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, - {"flush left paragraphs (those", PCONT, PModel(), false, false}, - {"with no body indent) are not", PCONT, PModel(), false, false}, - {"actually crowns. ", PCONT, PModel(), false, false}, - {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, - {"also flush left aligned. Usual-", PCONT, PModel(), false, false}, - {"ly, these paragraphs are set", PCONT, PModel(), false, false}, - {"apart vertically by some white-", PCONT, PModel(), false, false}, - {"space, but you can also detect", PCONT, PModel(), false, false}, - {"them by observing the big empty", PCONT, PModel(), false, false}, - {"space at the ends of the para-", PCONT, PModel(), false, false}, - {"graphs. ", PCONT, PModel(), false, false}, + {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, + {"flush left paragraphs (those", PCONT, PModel(), false, false}, + {"with no body indent) are not", PCONT, PModel(), false, false}, + {"actually crowns. ", PCONT, PModel(), false, false}, + {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, + {"also flush left aligned. 
Usual-", PCONT, PModel(), false, false}, + {"ly, these paragraphs are set", PCONT, PModel(), false, false}, + {"apart vertically by some white-", PCONT, PModel(), false, false}, + {"space, but you can also detect", PCONT, PModel(), false, false}, + {"them by observing the big empty", PCONT, PModel(), false, false}, + {"space at the ends of the para-", PCONT, PModel(), false, false}, + {"graphs. ", PCONT, PModel(), false, false}, }; TEST(ParagraphsText, TestRealFlushLeftParagraphs) { - TestParagraphDetection(kFlushLeftParagraphs, - countof(kFlushLeftParagraphs)); + TestParagraphDetection(kFlushLeftParagraphs, countof(kFlushLeftParagraphs)); } const TextAndModel kSingleFullPageContinuation[] = { - {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, - {"continuation. It flows from", PCONT, PModel(), false, false}, - {"line to line, using the full", PCONT, PModel(), false, false}, - {"column width with no clear", PCONT, PModel(), false, false}, - {"paragraph break, because it", PCONT, PModel(), false, false}, - {"actually doesn't have one. It", PCONT, PModel(), false, false}, - {"is the middle of one monster", PCONT, PModel(), false, false}, - {"paragraph continued from the", PCONT, PModel(), false, false}, - {"previous page and continuing", PCONT, PModel(), false, false}, - {"onto the next page. There-", PCONT, PModel(), false, false}, - {"fore, it ends up getting", PCONT, PModel(), false, false}, - {"marked as a crown and then", PCONT, PModel(), false, false}, - {"getting re-marked as any ex-", PCONT, PModel(), false, false}, - {"isting model. Not great, but", PCONT, PModel(), false, false}, + {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, + {"continuation. 
It flows from", PCONT, PModel(), false, false}, + {"line to line, using the full", PCONT, PModel(), false, false}, + {"column width with no clear", PCONT, PModel(), false, false}, + {"paragraph break, because it", PCONT, PModel(), false, false}, + {"actually doesn't have one. It", PCONT, PModel(), false, false}, + {"is the middle of one monster", PCONT, PModel(), false, false}, + {"paragraph continued from the", PCONT, PModel(), false, false}, + {"previous page and continuing", PCONT, PModel(), false, false}, + {"onto the next page. There-", PCONT, PModel(), false, false}, + {"fore, it ends up getting", PCONT, PModel(), false, false}, + {"marked as a crown and then", PCONT, PModel(), false, false}, + {"getting re-marked as any ex-", PCONT, PModel(), false, false}, + {"isting model. Not great, but", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestSingleFullPageContinuation) { - const TextAndModel* correct = kSingleFullPageContinuation; + const TextAndModel *correct = kSingleFullPageContinuation; int num_rows = countof(kSingleFullPageContinuation); std::vector row_infos; - GenericVector row_owners; + GenericVector row_owners; PARA_LIST paragraphs; - std::vector models; + std::vector models; models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10)); MakeAsciiRowInfos(correct, num_rows, &row_infos); tesseract::DetectParagraphs(3, &row_infos, &row_owners, ¶graphs, &models); EvaluateParagraphDetection(correct, num_rows, row_owners); - for (auto* model : models) { + for (auto *model : models) { delete model; } } const TextAndModel kRightAligned[] = { - {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, - {" uncommon in Left-to-Right", PCONT, PModel(), false, false}, - {" languages, but they do", PCONT, PModel(), false, false}, - {" exist.", PCONT, PModel(), false, false}, - {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, - {" horribly tiny paragraphs in", PCONT, PModel(), false, false}, - {" 
tables on which we have no", PCONT, PModel(), false, false}, - {" chance anyways.", PCONT, PModel(), false, false}, + {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, + {" uncommon in Left-to-Right", PCONT, PModel(), false, false}, + {" languages, but they do", PCONT, PModel(), false, false}, + {" exist.", PCONT, PModel(), false, false}, + {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, + {" horribly tiny paragraphs in", PCONT, PModel(), false, false}, + {" tables on which we have no", PCONT, PModel(), false, false}, + {" chance anyways.", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestRightAlignedParagraph) { @@ -350,21 +340,21 @@ TEST(ParagraphsTest, TestRightAlignedParagraph) { } const TextAndModel kTinyParagraphs[] = { - {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"obvious paragraph text, you might", PCONT, PModel(), false, false}, - {"find short exchanges of dialogue ", PCONT, PModel(), false, false}, - {"between characters. ", PCONT, PModel(), false, false}, - {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"mark a new paragraph whenever one", PCONT, PModel(), false, false}, - {"of the statistics (left, right or", PCONT, PModel(), false, false}, - {"center) changes from one text-", PCONT, PModel(), false, false}, - {"line to the next. Such an", PCONT, PModel(), false, false}, - {"approach would misclassify the", PCONT, PModel(), false, false}, - {"tiny paragraphs above as a single", PCONT, PModel(), false, false}, - {"paragraph. 
", PCONT, PModel(), false, false}, + {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"obvious paragraph text, you might", PCONT, PModel(), false, false}, + {"find short exchanges of dialogue ", PCONT, PModel(), false, false}, + {"between characters. ", PCONT, PModel(), false, false}, + {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"mark a new paragraph whenever one", PCONT, PModel(), false, false}, + {"of the statistics (left, right or", PCONT, PModel(), false, false}, + {"center) changes from one text-", PCONT, PModel(), false, false}, + {"line to the next. Such an", PCONT, PModel(), false, false}, + {"approach would misclassify the", PCONT, PModel(), false, false}, + {"tiny paragraphs above as a single", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestTinyParagraphs) { @@ -372,49 +362,43 @@ TEST(ParagraphsTest, TestTinyParagraphs) { } const TextAndModel kComplexPage1[] = { - {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false}, - {" Centered Title ", PCONT, PModel(), false, false}, - {" Paragraph Detection ", PCONT, PModel(), false, false}, - {" OCR TEAM ", PCONT, PModel(), false, false}, - {" 10 November 2010 ", PCONT, PModel(), false, false}, - {" ", PNONE, PModel(), false, false}, - {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"This paragraph starts at the top", PCONT, PModel(), false, false}, - {"of the page and takes 3 lines. 
", PCONT, PModel(), false, false}, - {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"which indicates that the first ", PCONT, PModel(), false, false}, - {"paragraph is not a continuation ", PCONT, PModel(), false, false}, - {"from a previous page, as it is ", PCONT, PModel(), false, false}, - {"indented just like this second ", PCONT, PModel(), false, false}, - {"paragraph. ", PCONT, PModel(), false, false}, - {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), - true, false}, - {" looks like the prior text ", PCONT, PModel(), false, false}, - {" but it is indented more ", PCONT, PModel(), false, false}, - {" and is fully justified. ", PCONT, PModel(), false, false}, - {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"centered text, block quotes, ", PCONT, PModel(), false, false}, - {"normal paragraphs, and lists ", PCONT, PModel(), false, false}, - {"like what follows? ", PCONT, PModel(), false, false}, - {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" looking for lines where the ", PCONT, PModel(), false, false}, - {" first word of the next line ", PCONT, PModel(), false, false}, - {" would fit on the previous ", PCONT, PModel(), false, false}, - {" line. ", PCONT, PModel(), false, false}, - {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" Python and try it out. ", PCONT, PModel(), false, false}, - {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" mistakes. ", PCONT, PModel(), false, false}, - {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"you can try to identify source ", PCONT, PModel(), false, false}, - {"code. Ouch! 
", PCONT, PModel(), false, false}, + {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false}, + {" Centered Title ", PCONT, PModel(), false, false}, + {" Paragraph Detection ", PCONT, PModel(), false, false}, + {" OCR TEAM ", PCONT, PModel(), false, false}, + {" 10 November 2010 ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top", PCONT, PModel(), false, false}, + {"of the page and takes 3 lines. ", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, + {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false}, + {" looks like the prior text ", PCONT, PModel(), false, false}, + {" but it is indented more ", PCONT, PModel(), false, false}, + {" and is fully justified. ", PCONT, PModel(), false, false}, + {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"centered text, block quotes, ", PCONT, PModel(), false, false}, + {"normal paragraphs, and lists ", PCONT, PModel(), false, false}, + {"like what follows? ", PCONT, PModel(), false, false}, + {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" looking for lines where the ", PCONT, PModel(), false, false}, + {" first word of the next line ", PCONT, PModel(), false, false}, + {" would fit on the previous ", PCONT, PModel(), false, false}, + {" line. ", PCONT, PModel(), false, false}, + {"8. 
Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" Python and try it out. ", PCONT, PModel(), false, false}, + {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" mistakes. ", PCONT, PModel(), false, false}, + {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"you can try to identify source ", PCONT, PModel(), false, false}, + {"code. Ouch! ", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestComplexPage1) { @@ -423,47 +407,41 @@ TEST(ParagraphsTest, TestComplexPage1) { // The same as above, but wider. const TextAndModel kComplexPage2[] = { - {" Awesome ", PSTART, - PModel(kCenter, 0, 0, 0, 0), false, false}, - {" Centered Title ", PCONT, PModel(), false, false}, - {" Paragraph Detection ", PCONT, PModel(), false, false}, - {" OCR TEAM ", PCONT, PModel(), false, false}, - {" 10 November 2010 ", PCONT, PModel(), false, false}, - {" ", PNONE, PModel(), false, false}, - {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"This paragraph starts at the top of", PCONT, PModel(), false, false}, - {"the page and takes 3 lines. ", PCONT, PModel(), false, false}, - {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"which indicates that the first ", PCONT, PModel(), false, false}, - {"paragraph is not a continuation ", PCONT, PModel(), false, false}, - {"from a previous page, as it is in- ", PCONT, PModel(), false, false}, - {"dented just like this second para- ", PCONT, PModel(), false, false}, - {"graph. ", PCONT, PModel(), false, false}, - {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), - true, false}, - {" looks like the prior text ", PCONT, PModel(), false, false}, - {" but it is indented more ", PCONT, PModel(), false, false}, - {" and is fully justified. 
", PCONT, PModel(), false, false}, - {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"ed text, block quotes, normal para-", PCONT, PModel(), false, false}, - {"graphs, and lists like what follow?", PCONT, PModel(), false, false}, - {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!! - {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" looking for lines where the ", PCONT, PModel(), false, false}, - {" first word of the next line ", PCONT, PModel(), false, false}, - {" would fit on the previous line. ", PCONT, PModel(), false, false}, - {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" Python and try it out. ", PCONT, PModel(), false, false}, - {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" mistakes. ", PCONT, PModel(), false, false}, - {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), - false, true}, - {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"you can try to identify source ", PCONT, PModel(), false, false}, - {"code. Ouch! ", PCONT, PModel(), false, false}, + {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false}, + {" Centered Title ", PCONT, PModel(), false, false}, + {" Paragraph Detection ", PCONT, PModel(), false, false}, + {" OCR TEAM ", PCONT, PModel(), false, false}, + {" 10 November 2010 ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top of", PCONT, PModel(), false, false}, + {"the page and takes 3 lines. 
", PCONT, PModel(), false, false}, + {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is in- ", PCONT, PModel(), false, false}, + {"dented just like this second para- ", PCONT, PModel(), false, false}, + {"graph. ", PCONT, PModel(), false, false}, + {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false}, + {" looks like the prior text ", PCONT, PModel(), false, false}, + {" but it is indented more ", PCONT, PModel(), false, false}, + {" and is fully justified. ", PCONT, PModel(), false, false}, + {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"ed text, block quotes, normal para-", PCONT, PModel(), false, false}, + {"graphs, and lists like what follow?", PCONT, PModel(), false, false}, + {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!! + {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" looking for lines where the ", PCONT, PModel(), false, false}, + {" first word of the next line ", PCONT, PModel(), false, false}, + {" would fit on the previous line. ", PCONT, PModel(), false, false}, + {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" Python and try it out. ", PCONT, PModel(), false, false}, + {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" mistakes. ", PCONT, PModel(), false, false}, + {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true}, + {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"you can try to identify source ", PCONT, PModel(), false, false}, + {"code. Ouch! 
", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, TestComplexPage2) { @@ -471,15 +449,14 @@ TEST(ParagraphsTest, TestComplexPage2) { } const TextAndModel kSubtleCrown[] = { - {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), - true, false}, - {"often not indented as the rest ", PCONT, PModel(), false, false}, - {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, - {"less it should be counted as the", PCONT, PModel(), false, false}, - {"same type of paragraph. ", PCONT, PModel(), false, false}, - {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, - {"should suffice. ", PCONT, PModel(), false, false}, - {" 1235 ", PNONE, PModel(), false, false}, + {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, + {"often not indented as the rest ", PCONT, PModel(), false, false}, + {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, + {"less it should be counted as the", PCONT, PModel(), false, false}, + {"same type of paragraph. ", PCONT, PModel(), false, false}, + {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"should suffice. ", PCONT, PModel(), false, false}, + {" 1235 ", PNONE, PModel(), false, false}, }; TEST(ParagraphsTest, TestSubtleCrown) { @@ -491,43 +468,72 @@ TEST(ParagraphsTest, TestStrayLineInBlock) { } const TextAndModel kUnlvRep3AO[] = { - {" Defined contribution plans cover employees in Australia, New", PSTART, - PModel(kLeft, 0, 50, 0, 0), false, false}, - {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false}, - {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false}, - {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false}, - {"ing a portion of their compensation. 
The Company matches com- ", PCONT, PModel(), false, false}, - {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false}, - {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false}, - {"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false}, - {" In addition to providing pension benefits, the Company pro- ", PSTART, - PModel(kLeft, 0, 50, 0, 0), false, false}, - {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false}, - {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false}, - {"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false, false}, - {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false}, - {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false}, - {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false}, - {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false}, - {" The U.S. plan covering the parent company is the largest plan.", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false}, - {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false}, - {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false}, - {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false}, - {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false}, - {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false}, - {"credited service. 
The Company has the ability to change these ", PCONT, PModel(), false, false}, - {"benefits at any time. ", PCONT, PModel(), false, false}, - {" Effective October 1993, the Company amended its health ", PSTART, - PModel(kLeft, 0, 50, 0, 0), false, false}, - {"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false}, - {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false}, - {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false}, - {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false}, - {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false}, - {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false}, - {"for 1994 by approximately $83. ", PCONT, PModel(), false, false}, + {" Defined contribution plans cover employees in Australia, New", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, + false}, + {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), + false, false}, + {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, + false}, + {"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, + false}, + {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, + false}, + {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, + false}, + {"1993, $98 in 1992 and $89 in 1991. 
", PCONT, PModel(), false, + false}, + {" In addition to providing pension benefits, the Company pro- ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, + false}, + {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, + false}, + {"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false, + false}, + {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, + false}, + {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, + false}, + {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, + false}, + {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, + false}, + {" The U.S. plan covering the parent company is the largest plan.", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, + false}, + {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, + false}, + {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, + false}, + {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), + false, false}, + {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, + false}, + {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, + false}, + {"credited service. The Company has the ability to change these ", PCONT, PModel(), false, + false}, + {"benefits at any time. ", PCONT, PModel(), false, + false}, + {" Effective October 1993, the Company amended its health ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"benefits plan in the U.S. 
to cap the cost absorbed by the Company ", PCONT, PModel(), false, + false}, + {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, + false}, + {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, + false}, + {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, + false}, + {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, + false}, + {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, + false}, + {"for 1994 by approximately $83. ", PCONT, PModel(), false, + false}, }; TEST(ParagraphsTest, TestUnlvInsurance) { @@ -540,19 +546,19 @@ TEST(ParagraphsTest, TestUnlvInsurance) { // paragraph or two. // This example comes from Volume 9886293, Page 5 const TextAndModel kTableOfContents[] = { - {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, - {" Discussion . . . . 
6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, }; TEST(ParagraphsTest, TestSplitsOutLeaderLines) { @@ -560,141 +566,156 @@ TEST(ParagraphsTest, TestSplitsOutLeaderLines) { } const TextAndModel kTextWithSourceCode[] = { - {" A typical page of a programming book may contain", PSTART, - PModel(kLeft, 0, 20, 0, 0), false, false}, - {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false}, - {"being described in prose. Such examples should be", PCONT, PModel(), false, false}, - {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false}, - {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false}, - {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false}, - {"source code would lead to a bad reading experience", PCONT, PModel(), false, false}, - {"when the text is re-flowed. 
", PCONT, PModel(), false, false}, - {" Let's show this by describing the function fact-", PSTART, - PModel(kLeft, 0, 20, 0, 0), false, false}, - {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false}, - {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false}, - {"that the typical C implementation will only work ", PCONT, PModel(), false, false}, - {"for values less than about 12: ", PCONT, PModel(), false, false}, - {" ", PNONE, PModel(), false, false}, - {" # Naive implementation in C ", PCONT, PModel(), false, false}, - {" int factorial(int n) { ", PCONT, PModel(), false, false}, - {" if (n < 2) ", PCONT, PModel(), false, false}, - {" return 1; ", PCONT, PModel(), false, false}, - {" return n * factorial(n - 1); ", PCONT, PModel(), false, false}, - {" } ", PCONT, PModel(), false, false}, - {" ", PCONT, PModel(), false, false}, - {" The C programming language does not have built- ", PSTART, - PModel(kLeft, 0, 20, 0, 0), false, false}, - {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false}, - {"naive implementation simply returns random values ", PCONT, PModel(), false, false}, - {"if even a moderate sized n is provided. ", PCONT, PModel(), false, false}, + {" A typical page of a programming book may contain", PSTART, PModel(kLeft, 0, 20, 0, 0), + false, false}, + {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false}, + {"being described in prose. Such examples should be", PCONT, PModel(), false, false}, + {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false}, + {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false}, + {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false}, + {"source code would lead to a bad reading experience", PCONT, PModel(), false, false}, + {"when the text is re-flowed. 
", PCONT, PModel(), false, false}, + {" Let's show this by describing the function fact-", PSTART, PModel(kLeft, 0, 20, 0, 0), + false, false}, + {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false}, + {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false}, + {"that the typical C implementation will only work ", PCONT, PModel(), false, false}, + {"for values less than about 12: ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" # Naive implementation in C ", PCONT, PModel(), false, false}, + {" int factorial(int n) { ", PCONT, PModel(), false, false}, + {" if (n < 2) ", PCONT, PModel(), false, false}, + {" return 1; ", PCONT, PModel(), false, false}, + {" return n * factorial(n - 1); ", PCONT, PModel(), false, false}, + {" } ", PCONT, PModel(), false, false}, + {" ", PCONT, PModel(), false, false}, + {" The C programming language does not have built- ", PSTART, PModel(kLeft, 0, 20, 0, 0), + false, false}, + {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false}, + {"naive implementation simply returns random values ", PCONT, PModel(), false, false}, + {"if even a moderate sized n is provided. 
", PCONT, PModel(), false, false}, }; TEST(ParagraphsTest, NotDistractedBySourceCode) { - TestParagraphDetection(kTextWithSourceCode, - countof(kTextWithSourceCode)); + TestParagraphDetection(kTextWithSourceCode, countof(kTextWithSourceCode)); } const TextAndModel kOldManAndSea[] = { - {"royal palm which are called guano and in it there was a bed, a", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false}, - {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false}, - {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false}, - {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false}, - {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false}, - {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false}, - {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false}, - {"shirt. ", PCONT, PModel(), false, false}, - {" \"What do you have to eat?\" the boy asked. ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"A pot of yellow rice with fish. Do you want some?\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"No. I will eat at home. Do you want me to make the fire?\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"No. I will make it later on. Or I may eat the rice cold.\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"May I take the cast net?\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"Of course.\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" There was no cast net and the boy remembered when they had", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"sold it. But they went through this fiction every day. 
There was no", PCONT, PModel(), false, false}, - {"pot of yellow rice and fish and the boy knew this too. " - " ", PCONT, PModel(), false, false}, - {" \"Eighty-five is a lucky number,\" the old man said. \"How", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"would you like to see me bring one in that dressed out over a " - "thou-", PCONT, PModel(), false, false}, - {"sand pounds? " - " ", PCONT, PModel(), false, false}, - {" \"I'll get the cast net and go for sardines. Will you sit in the " - "sun", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"in the doorway?\" " - " ", PCONT, PModel(), false, false}, - {" \"Yes. I have yesterday's paper and I will read the baseball.\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" The boy did not know whether yesterday's paper was a fiction", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false}, - {" \"Pedrico gave it to me at the bodega,\" he explained. " - " ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"I'll be back when I have the sardines. I'll keep yours and mine", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false}, - {"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false}, - {" \"The Yankees cannot lose.\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"But I fear the Indians of Cleveland.\" ", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {" \"Have faith in the Yankees my son. 
Think of the great Di-", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"Maggio.\" ", PCONT, PModel(), false, false}, - {" \"I fear both the Tigers of Detroit and the Indians of Cleve-", - PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, - {"land.\" ", PCONT, PModel(), false, false} -}; + {"royal palm which are called guano and in it there was a bed, a", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), + false, false}, + {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), + false, false}, + {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), + false, false}, + {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), + false, false}, + {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), + false, false}, + {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), + false, false}, + {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), + false, false}, + {"shirt. ", PCONT, PModel(), + false, false}, + {" \"What do you have to eat?\" the boy asked. ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"A pot of yellow rice with fish. Do you want some?\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"No. I will eat at home. Do you want me to make the fire?\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"No. I will make it later on. Or I may eat the rice cold.\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"May I take the cast net?\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"Of course.\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" There was no cast net and the boy remembered when they had", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"sold it. 
But they went through this fiction every day. There was no", PCONT, PModel(), + false, false}, + {"pot of yellow rice and fish and the boy knew this too. " + " ", + PCONT, PModel(), false, false}, + {" \"Eighty-five is a lucky number,\" the old man said. \"How", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"would you like to see me bring one in that dressed out over a " + "thou-", + PCONT, PModel(), false, false}, + {"sand pounds? " + " ", + PCONT, PModel(), false, false}, + {" \"I'll get the cast net and go for sardines. Will you sit in the " + "sun", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"in the doorway?\" " + " ", + PCONT, PModel(), false, false}, + {" \"Yes. I have yesterday's paper and I will read the baseball.\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" The boy did not know whether yesterday's paper was a fiction", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"too. But the old man brought it out from under the bed. ", PCONT, PModel(), + false, false}, + {" \"Pedrico gave it to me at the bodega,\" he explained. " + " ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"I'll be back when I have the sardines. I'll keep yours and mine", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"together on ice and we can share them in the morning. When I", PCONT, PModel(), + false, false}, + {"come back you can tell me about the baseball.\" ", PCONT, PModel(), + false, false}, + {" \"The Yankees cannot lose.\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"But I fear the Indians of Cleveland.\" ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"Have faith in the Yankees my son. 
Think of the great Di-", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"Maggio.\" ", PCONT, PModel(), + false, false}, + {" \"I fear both the Tigers of Detroit and the Indians of Cleve-", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"land.\" ", PCONT, PModel(), + false, false}}; TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) { TestParagraphDetection(kOldManAndSea, countof(kOldManAndSea)); } const TextAndModel kNewZealandIndex[] = { - {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {" 138 ", PCONT, PModel(), false, false}, - {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {" 145 ", PCONT, PModel(), false, false}, - {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Oliver, W. 
H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {" 85 ", PCONT, PModel(), false, false}, - {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, - {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false} -}; + {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 138 ", PCONT, PModel(), false, false}, + {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 
0, 0, 30, 0), false, false}, + {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 145 ", PCONT, PModel(), false, false}, + {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 85 ", PCONT, PModel(), false, false}, + {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 
30, 0), false, false}, + {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}}; TEST(ParagraphsTest, IndexPageTest) { TestParagraphDetection(kNewZealandIndex, countof(kNewZealandIndex)); @@ -702,4 +723,4 @@ TEST(ParagraphsTest, IndexPageTest) { // TODO(eger): Add some right-to-left examples, and fix the algorithm as needed. -} // namespace +} // namespace tesseract diff --git a/unittest/params_model_test.cc b/unittest/params_model_test.cc index 8627ab8e..3cce45b5 100644 --- a/unittest/params_model_test.cc +++ b/unittest/params_model_test.cc @@ -9,20 +9,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // std::string +#include // std::string #include #include "include_gunit.h" #include "params_model.h" -#include "serialis.h" // TFile -#include "tprintf.h" // tprintf +#include "serialis.h" // TFile +#include "tprintf.h" // tprintf namespace tesseract { // Test some basic I/O of params model files (automated learning of language // model weights). 
#ifndef DISABLED_LEGACY_ENGINE -static bool LoadFromFile(tesseract::ParamsModel& model, const char* lang, const char* full_path) { +static bool LoadFromFile(tesseract::ParamsModel &model, const char *lang, const char *full_path) { tesseract::TFile fp; if (!fp.Open(full_path, nullptr)) { tprintf("Error opening file %s\n", full_path); @@ -34,20 +34,20 @@ static bool LoadFromFile(tesseract::ParamsModel& model, const char* lang, const class ParamsModelTest : public testing::Test { #ifndef DISABLED_LEGACY_ENGINE - protected: +protected: void SetUp() override { std::locale::global(std::locale("")); } - std::string TestDataNameToPath(const std::string& name) const { + std::string TestDataNameToPath(const std::string &name) const { return file::JoinPath(TESTDATA_DIR, name); } - std::string OutputNameToPath(const std::string& name) const { + std::string OutputNameToPath(const std::string &name) const { return file::JoinPath(FLAGS_test_tmpdir, name); } // Test that we are able to load a params model, save it, reload it, // and verify that the re-serialized version is the same as the original. 
- void TestParamsModelRoundTrip(const std::string& params_model_filename) const { + void TestParamsModelRoundTrip(const std::string &params_model_filename) const { tesseract::ParamsModel orig_model; tesseract::ParamsModel duplicate_model; file::MakeTmpdir(); @@ -72,4 +72,4 @@ TEST_F(ParamsModelTest, TestEngParamsModelIO) { #endif } -} // namespace +} // namespace tesseract diff --git a/unittest/progress_test.cc b/unittest/progress_test.cc index 0444eccd..8e41630b 100644 --- a/unittest/progress_test.cc +++ b/unittest/progress_test.cc @@ -27,7 +27,7 @@ #include #include #include -#include // std::unique_ptr +#include // std::unique_ptr #include #include @@ -35,19 +35,20 @@ namespace tesseract { class QuickTest : public testing::Test { - protected: - virtual void SetUp() { start_time_ = time(nullptr); } +protected: + virtual void SetUp() { + start_time_ = time(nullptr); + } virtual void TearDown() { const time_t end_time = time(nullptr); EXPECT_TRUE(end_time - start_time_ <= 25) - << "The test took too long - " - << ::testing::PrintToString(end_time - start_time_); + << "The test took too long - " << ::testing::PrintToString(end_time - start_time_); } time_t start_time_; }; class ClassicMockProgressSink { - public: +public: MOCK_METHOD1(classicProgress, bool(int)); MOCK_METHOD1(cancel, bool(int)); @@ -57,32 +58,30 @@ class ClassicMockProgressSink { monitor.progress_callback = [](int progress, int, int, int, int) -> bool { return instance->classicProgress(progress); }; - monitor.cancel = [](void* ths, int words) -> bool { - return ((ClassicMockProgressSink*)ths)->cancel(words); + monitor.cancel = [](void *ths, int words) -> bool { + return ((ClassicMockProgressSink *)ths)->cancel(words); }; monitor.cancel_this = this; instance = this; } - static ClassicMockProgressSink* instance; + static ClassicMockProgressSink *instance; }; -ClassicMockProgressSink* ClassicMockProgressSink::instance = nullptr; +ClassicMockProgressSink *ClassicMockProgressSink::instance = nullptr; class 
NewMockProgressSink : public ClassicMockProgressSink { - public: +public: MOCK_METHOD1(progress, bool(int)); NewMockProgressSink() { - monitor.progress_callback2 = [](ETEXT_DESC* ths, int, int, int, - int) -> bool { - return ((NewMockProgressSink*)ths->cancel_this)->progress(ths->progress); + monitor.progress_callback2 = [](ETEXT_DESC *ths, int, int, int, int) -> bool { + return ((NewMockProgressSink *)ths->cancel_this)->progress(ths->progress); }; } }; -void ClassicProgressTester(const char* imgname, const char* tessdatadir, - const char* lang) { +void ClassicProgressTester(const char *imgname, const char *tessdatadir, const char *lang) { using ::testing::_; using ::testing::AllOf; using ::testing::AtLeast; @@ -93,22 +92,18 @@ void ClassicProgressTester(const char* imgname, const char* tessdatadir, using ::testing::SaveArg; std::unique_ptr api(new tesseract::TessBaseAPI()); - ASSERT_FALSE(api->Init(tessdatadir, lang)) - << "Could not initialize tesseract."; - Pix* image = pixRead(imgname); + ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract."; + Pix *image = pixRead(imgname); ASSERT_TRUE(image != nullptr) << "Failed to read test image."; api->SetImage(image); ClassicMockProgressSink progressSink; int currentProgress = -1; - EXPECT_CALL(progressSink, - classicProgress(AllOf(Gt(currentProgress), Le(100)))) + EXPECT_CALL(progressSink, classicProgress(AllOf(Gt(currentProgress), Le(100)))) .Times(AtLeast(5)) .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false))); - EXPECT_CALL(progressSink, cancel(_)) - .Times(AtLeast(5)) - .WillRepeatedly(Return(false)); + EXPECT_CALL(progressSink, cancel(_)).Times(AtLeast(5)).WillRepeatedly(Return(false)); EXPECT_EQ(api->Recognize(&progressSink.monitor), false); EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%"; @@ -117,8 +112,7 @@ void ClassicProgressTester(const char* imgname, const char* tessdatadir, pixDestroy(&image); } -void NewProgressTester(const char* imgname, 
const char* tessdatadir, - const char* lang) { +void NewProgressTester(const char *imgname, const char *tessdatadir, const char *lang) { using ::testing::_; using ::testing::AllOf; using ::testing::AtLeast; @@ -129,9 +123,8 @@ void NewProgressTester(const char* imgname, const char* tessdatadir, using ::testing::SaveArg; std::unique_ptr api(new tesseract::TessBaseAPI()); - ASSERT_FALSE(api->Init(tessdatadir, lang)) - << "Could not initialize tesseract."; - Pix* image = pixRead(imgname); + ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract."; + Pix *image = pixRead(imgname); ASSERT_TRUE(image != nullptr) << "Failed to read test image."; api->SetImage(image); @@ -139,12 +132,10 @@ void NewProgressTester(const char* imgname, const char* tessdatadir, int currentProgress = -1; EXPECT_CALL(progressSink, classicProgress(_)).Times(0); - EXPECT_CALL(progressSink, progress(AllOf(Gt(currentProgress), Le(100)))) + EXPECT_CALL(progressSink, progress(AllOf(Gt(currentProgress), Le(100)))) .Times(AtLeast(5)) .WillRepeatedly(DoAll(SaveArg<0>(¤tProgress), Return(false))); - EXPECT_CALL(progressSink, cancel(_)) - .Times(AtLeast(5)) - .WillRepeatedly(Return(false)); + EXPECT_CALL(progressSink, cancel(_)).Times(AtLeast(5)).WillRepeatedly(Return(false)); EXPECT_EQ(api->Recognize(&progressSink.monitor), false); EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%"; @@ -154,12 +145,11 @@ void NewProgressTester(const char* imgname, const char* tessdatadir, } TEST(QuickTest, ClassicProgressReporting) { - ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", - "eng"); + ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng"); } TEST(QuickTest, NewProgressReporting) { NewProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng"); } -} // namespace +} // namespace tesseract diff --git a/unittest/qrsequence_test.cc b/unittest/qrsequence_test.cc index 783228d8..0757ba95 100644 --- 
a/unittest/qrsequence_test.cc +++ b/unittest/qrsequence_test.cc @@ -9,7 +9,6 @@ // See the License for the specific language governing permissions and // limitations under the License. - #include #include @@ -21,8 +20,8 @@ namespace tesseract { class TestableQRSequenceGenerator : public QRSequenceGenerator { - public: - explicit TestableQRSequenceGenerator(const int& N) : QRSequenceGenerator(N) {} +public: + explicit TestableQRSequenceGenerator(const int &N) : QRSequenceGenerator(N) {} // Overriding scope for testing using QRSequenceGenerator::GetBinaryReversedInteger; }; @@ -38,7 +37,7 @@ TEST(QRSequenceGenerator, GetBinaryReversedInteger) { // Trivial test fixture for a parameterized test. class QRSequenceGeneratorTest : public ::testing::TestWithParam { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } @@ -50,7 +49,8 @@ TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) { std::vector vals(kRangeSize); CycleTimer timer; timer.Restart(); - for (int i = 0; i < kRangeSize; ++i) vals[i] = generator.GetVal(); + for (int i = 0; i < kRangeSize; ++i) + vals[i] = generator.GetVal(); LOG(INFO) << kRangeSize << "-length sequence took " << timer.GetInMs() << "ms"; // Sort the numbers to verify that we've covered the range without repetition. std::sort(vals.begin(), vals.end()); @@ -65,5 +65,5 @@ TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) { // Run a parameterized test using the following range sizes. INSTANTIATE_TEST_SUITE_P(RangeTest, QRSequenceGeneratorTest, - ::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6)); -} // namespace + ::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6)); +} // namespace tesseract diff --git a/unittest/recodebeam_test.cc b/unittest/recodebeam_test.cc index 6e9bc4e3..6816aa64 100644 --- a/unittest/recodebeam_test.cc +++ b/unittest/recodebeam_test.cc @@ -9,22 +9,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
- #include "include_gunit.h" -#include "log.h" // for LOG +#include "log.h" // for LOG #include "genericvector.h" -#include "recodebeam.h" #include "matrix.h" +#include "normstrngs.h" #include "pageres.h" #include "ratngs.h" +#include "recodebeam.h" #include "unicharcompress.h" -#include "normstrngs.h" #include "unicharset_training_utils.h" #include "helpers.h" -#include "absl/strings/str_format.h" // for absl::StrFormat +#include "absl/strings/str_format.h" // for absl::StrFormat namespace tesseract { @@ -40,53 +39,48 @@ const int kPadding = 64; // f stronger than t in "Get". // weak space between Gef and s and between s and words. // weak space between words and right. -const char* kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d", +const char *kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d", "s", "", "r", "i", "g", "h", "t", ".", nullptr}; -const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, - 0.89, 0.99, 0.99, 0.99, 0.99, 0.95, - 0.99, 0.90, 0.90, 0.90, 0.95, 0.75}; -const char* kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h", +const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99, + 0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75}; +const char *kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h", "S", " ", "t", "I", "9", "b", "f", ",", nullptr}; -const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, - 0.10, 0.01, 0.01, 0.01, 0.01, 0.05, - 0.01, 0.09, 0.09, 0.09, 0.05, 0.25}; +const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01, + 0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25}; -const char* kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr}; +const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr}; const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98}; -const char* kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr}; +const char *kZH2nds[] = {"学", "储", "投", "生", "学", 
"生", "实", nullptr}; const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01}; -const char* kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr}; +const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr}; const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97}; -const char* kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr}; +const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr}; const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01}; class RecodeBeamTest : public ::testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); } RecodeBeamTest() : lstm_dict_(&ccutil_) {} - ~RecodeBeamTest() { lstm_dict_.End(); } + ~RecodeBeamTest() { + lstm_dict_.End(); + } // Loads and compresses the given unicharset. - void LoadUnicharset(const std::string& unicharset_name) { - std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, - "radical-stroke.txt"); - std::string unicharset_file = - file::JoinPath(TESTDATA_DIR, unicharset_name); + void LoadUnicharset(const std::string &unicharset_name) { + std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt"); + std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name); std::string radical_data; - CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, - file::Defaults())); + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults())); CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str())); - unichar_null_char_ = ccutil_.unicharset.has_special_codes() - ? UNICHAR_BROKEN - : ccutil_.unicharset.size(); + unichar_null_char_ = + ccutil_.unicharset.has_special_codes() ? 
UNICHAR_BROKEN : ccutil_.unicharset.size(); STRING radical_str(radical_data.c_str()); - EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, - &radical_str)); + EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str)); RecodedCharID code; recoder_.EncodeUnichar(unichar_null_char_, &code); encoded_null_char_ = code(0); @@ -100,10 +94,9 @@ class RecodeBeamTest : public ::testing::Test { LOG(INFO) << "Wrote encoding to:" << output_name << "\n"; } // Loads the dictionary. - void LoadDict(const std::string& lang) { + void LoadDict(const std::string &lang) { std::string traineddata_name = lang + ".traineddata"; - std::string traineddata_file = - file::JoinPath(TESTDATA_DIR, traineddata_name); + std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name); lstm_dict_.SetupForLoad(nullptr); tesseract::TessdataManager mgr; mgr.Init(traineddata_file.c_str()); @@ -112,8 +105,8 @@ class RecodeBeamTest : public ::testing::Test { } // Expects the appropriate results from the compressed_ ccutil_.unicharset. - void ExpectCorrect(const GENERIC_2D_ARRAY& output, - const GenericVector& transcription) { + void ExpectCorrect(const GENERIC_2D_ARRAY &output, + const GenericVector &transcription) { // Get the utf8 string of the transcription. 
std::string truth_utf8; for (int i = 0; i < transcription.size(); ++i) { @@ -122,17 +115,15 @@ class RecodeBeamTest : public ::testing::Test { PointerVector words; ExpectCorrect(output, truth_utf8, nullptr, &words); } - void ExpectCorrect(const GENERIC_2D_ARRAY& output, - const std::string& truth_utf8, Dict* dict, - PointerVector* words) { + void ExpectCorrect(const GENERIC_2D_ARRAY &output, const std::string &truth_utf8, + Dict *dict, PointerVector *words) { RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict); beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr); // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug: // beam_search.DebugBeams(ccutil_.unicharset); std::vector labels, xcoords; beam_search.ExtractBestPathAsLabels(&labels, &xcoords); - LOG(INFO) << "Labels size = " << labels.size() << " coords " - << xcoords.size() << "\n"; + LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n"; // Now decode using recoder_. std::string decoded; int end = 1; @@ -143,12 +134,9 @@ class RecodeBeamTest : public ::testing::Test { do { code.Set(code.length(), labels[index++]); uni_id = recoder_.DecodeUnichar(code); - } while (index < labels.size() && - code.length() < RecodedCharID::kMaxCodeLen && - (uni_id == INVALID_UNICHAR_ID || - !recoder_.IsValidFirstCode(labels[index]))); - EXPECT_NE(INVALID_UNICHAR_ID, uni_id) - << "index=" << index << "/" << labels.size(); + } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen && + (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index]))); + EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size(); // To the extent of truth_utf8, we expect decoded to match, but if // transcription is shorter, that is OK too, as we may just be testing // that we get a valid sequence when padded with random data. 
@@ -161,8 +149,7 @@ class RecodeBeamTest : public ::testing::Test { // Check that ExtractBestPathAsUnicharIds does the same thing. std::vector unichar_ids; std::vector certainties, ratings; - beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, - &unichar_ids, &certainties, + beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties, &ratings, &xcoords); std::string u_decoded; float total_rating = 0.0f; @@ -171,12 +158,13 @@ class RecodeBeamTest : public ::testing::Test { // transcription is shorter, that is OK too, as we may just be testing // that we get a valid sequence when padded with random data. if (u_decoded.size() < truth_utf8.size()) { - const char* str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]); + const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]); total_rating += ratings[u]; - LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u, - unichar_ids[u], str, certainties[u], - ratings[u], total_rating, xcoords[u]) << "\n"; - if (str[0] == ' ') total_rating = 0.0f; + LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u, unichar_ids[u], + str, certainties[u], ratings[u], total_rating, xcoords[u]) + << "\n"; + if (str[0] == ' ') + total_rating = 0.0f; u_decoded += str; } } @@ -185,20 +173,20 @@ class RecodeBeamTest : public ::testing::Test { // Check that ExtractBestPathAsWords does the same thing. 
TBOX line_box(0, 0, 100, 10); for (int i = 0; i < 2; ++i) { - beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, - &ccutil_.unicharset, words); + beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words); std::string w_decoded; for (int w = 0; w < words->size(); ++w) { - const WERD_RES* word = (*words)[w]; + const WERD_RES *word = (*words)[w]; if (w_decoded.size() < truth_utf8.size()) { - if (!w_decoded.empty() && word->word->space()) w_decoded += " "; + if (!w_decoded.empty() && word->word->space()) + w_decoded += " "; w_decoded += word->best_choice->unichar_string().c_str(); } LOG(INFO) << absl::StrFormat("Word:%d = %s, c=%g, r=%g, perm=%d", w, - word->best_choice->unichar_string().c_str(), - word->best_choice->certainty(), - word->best_choice->rating(), - word->best_choice->permuter()) << "\n"; + word->best_choice->unichar_string().c_str(), + word->best_choice->certainty(), word->best_choice->rating(), + word->best_choice->permuter()) + << "\n"; } std::string w_trunc(w_decoded.data(), truth_utf8.size()); if (truth_utf8 != w_trunc) { @@ -212,8 +200,8 @@ class RecodeBeamTest : public ::testing::Test { } // Generates easy encoding of the given unichar_ids, and pads with at least // padding of random data. - GENERIC_2D_ARRAY GenerateRandomPaddedOutputs( - const GenericVector& unichar_ids, int padding) { + GENERIC_2D_ARRAY GenerateRandomPaddedOutputs(const GenericVector &unichar_ids, + int padding) { int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen; int num_codes = recoder_.code_range(); GENERIC_2D_ARRAY outputs(width + padding, num_codes, 0.0f); @@ -242,20 +230,21 @@ class RecodeBeamTest : public ::testing::Test { // Normalize the probs. 
for (int t = 0; t < width; ++t) { double sum = 0.0; - for (int i = 0; i < num_codes; ++i) sum += outputs(t, i); - for (int i = 0; i < num_codes; ++i) outputs(t, i) /= sum; + for (int i = 0; i < num_codes; ++i) + sum += outputs(t, i); + for (int i = 0; i < num_codes; ++i) + outputs(t, i) /= sum; } return outputs; } // Encodes a utf8 string (character) as unichar_id, then recodes, and sets // the score for the appropriate sequence of codes, returning the ending t. - int EncodeUTF8(const char* utf8_str, float score, int start_t, TRand* random, - GENERIC_2D_ARRAY* outputs) { + int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random, + GENERIC_2D_ARRAY *outputs) { int t = start_t; std::vector unichar_ids; - EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, - nullptr, nullptr)); + EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr)); if (unichar_ids.empty() || utf8_str[0] == '\0') { unichar_ids.clear(); unichar_ids.push_back(unichar_null_char_); @@ -268,8 +257,7 @@ class RecodeBeamTest : public ::testing::Test { for (int i = 0; i < len; ++i) { // Apply the desired score. (*outputs)(t++, code(i)) = score; - if (random != nullptr && - t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { + if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { int dups = static_cast(random->UnsignedRand(3.0)); for (int d = 0; d < dups; ++d) { // Duplicate the desired score. @@ -277,8 +265,7 @@ class RecodeBeamTest : public ::testing::Test { } } } - if (random != nullptr && - t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { + if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { int dups = static_cast(random->UnsignedRand(3.0)); for (int d = 0; d < dups; ++d) { // Add a random number of nulls as well. 
@@ -292,13 +279,12 @@ class RecodeBeamTest : public ::testing::Test { // uses scores1 for chars1 and scores2 for chars2, and everything else gets // the leftovers shared out equally. Note that empty string encodes as the // null_char_. - GENERIC_2D_ARRAY GenerateSyntheticOutputs(const char* chars1[], - const float scores1[], - const char* chars2[], - const float scores2[], - TRand* random) { + GENERIC_2D_ARRAY GenerateSyntheticOutputs(const char *chars1[], const float scores1[], + const char *chars2[], const float scores2[], + TRand *random) { int width = 0; - while (chars1[width] != nullptr) ++width; + while (chars1[width] != nullptr) + ++width; int padding = width * RecodedCharID::kMaxCodeLen; int num_codes = recoder_.code_range(); GENERIC_2D_ARRAY outputs(width + padding, num_codes, 0.0f); @@ -312,7 +298,8 @@ class RecodeBeamTest : public ::testing::Test { int max_t = std::max(end_t1, end_t2); while (t < max_t) { double total_score = 0.0; - for (int j = 0; j < num_codes; ++j) total_score += outputs(t, j); + for (int j = 0; j < num_codes; ++j) + total_score += outputs(t, j); double null_remainder = (1.0 - total_score) / 2.0; double remainder = null_remainder / (num_codes - 2); if (outputs(t, encoded_null_char_) < null_remainder) { @@ -321,7 +308,8 @@ class RecodeBeamTest : public ::testing::Test { remainder += remainder; } for (int j = 0; j < num_codes; ++j) { - if (outputs(t, j) == 0.0f) outputs(t, j) = remainder; + if (outputs(t, j) == 0.0f) + outputs(t, j) = remainder; } ++t; } @@ -340,16 +328,17 @@ class RecodeBeamTest : public ::testing::Test { }; TEST_F(RecodeBeamTest, DoesChinese) { - LOG(INFO) << "Testing chi_tra" << "\n"; + LOG(INFO) << "Testing chi_tra" + << "\n"; LoadUnicharset("chi_tra.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. 
GenericVector transcription; for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) transcription.push_back(i); - GENERIC_2D_ARRAY outputs = - GenerateRandomPaddedOutputs(transcription, kPadding); + GENERIC_2D_ARRAY outputs = GenerateRandomPaddedOutputs(transcription, kPadding); ExpectCorrect(outputs, transcription); - LOG(INFO) << "Testing chi_sim" << "\n"; + LOG(INFO) << "Testing chi_sim" + << "\n"; LoadUnicharset("chi_sim.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. transcription.clear(); @@ -360,72 +349,74 @@ TEST_F(RecodeBeamTest, DoesChinese) { } TEST_F(RecodeBeamTest, DoesJapanese) { - LOG(INFO) << "Testing jpn" << "\n"; + LOG(INFO) << "Testing jpn" + << "\n"; LoadUnicharset("jpn.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. GenericVector transcription; for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) transcription.push_back(i); - GENERIC_2D_ARRAY outputs = - GenerateRandomPaddedOutputs(transcription, kPadding); + GENERIC_2D_ARRAY outputs = GenerateRandomPaddedOutputs(transcription, kPadding); ExpectCorrect(outputs, transcription); } TEST_F(RecodeBeamTest, DoesKorean) { - LOG(INFO) << "Testing kor" << "\n"; + LOG(INFO) << "Testing kor" + << "\n"; LoadUnicharset("kor.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. GenericVector transcription; for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) transcription.push_back(i); - GENERIC_2D_ARRAY outputs = - GenerateRandomPaddedOutputs(transcription, kPadding); + GENERIC_2D_ARRAY outputs = GenerateRandomPaddedOutputs(transcription, kPadding); ExpectCorrect(outputs, transcription); } TEST_F(RecodeBeamTest, DoesKannada) { - LOG(INFO) << "Testing kan" << "\n"; + LOG(INFO) << "Testing kan" + << "\n"; LoadUnicharset("kan.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. 
GenericVector transcription; for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) transcription.push_back(i); - GENERIC_2D_ARRAY outputs = - GenerateRandomPaddedOutputs(transcription, kPadding); + GENERIC_2D_ARRAY outputs = GenerateRandomPaddedOutputs(transcription, kPadding); ExpectCorrect(outputs, transcription); } TEST_F(RecodeBeamTest, DoesMarathi) { - LOG(INFO) << "Testing mar" << "\n"; + LOG(INFO) << "Testing mar" + << "\n"; LoadUnicharset("mar.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. GenericVector transcription; for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) transcription.push_back(i); - GENERIC_2D_ARRAY outputs = - GenerateRandomPaddedOutputs(transcription, kPadding); + GENERIC_2D_ARRAY outputs = GenerateRandomPaddedOutputs(transcription, kPadding); ExpectCorrect(outputs, transcription); } TEST_F(RecodeBeamTest, DoesEnglish) { - LOG(INFO) << "Testing eng" << "\n"; + LOG(INFO) << "Testing eng" + << "\n"; LoadUnicharset("eng.unicharset"); // Correctly reproduce the first kNumchars characters from easy output. 
GenericVector transcription; for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) transcription.push_back(i); - GENERIC_2D_ARRAY outputs = - GenerateRandomPaddedOutputs(transcription, kPadding); + GENERIC_2D_ARRAY outputs = GenerateRandomPaddedOutputs(transcription, kPadding); ExpectCorrect(outputs, transcription); } TEST_F(RecodeBeamTest, DISABLED_EngDictionary) { - LOG(INFO) << "Testing eng dictionary" << "\n"; + LOG(INFO) << "Testing eng dictionary" + << "\n"; LoadUnicharset("eng_beam.unicharset"); - GENERIC_2D_ARRAY outputs = GenerateSyntheticOutputs( - kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr); + GENERIC_2D_ARRAY outputs = + GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr); std::string default_str; - for (int i = 0; kGWRTops[i] != nullptr; ++i) default_str += kGWRTops[i]; + for (int i = 0; kGWRTops[i] != nullptr; ++i) + default_str += kGWRTops[i]; PointerVector words; ExpectCorrect(outputs, default_str, nullptr, &words); // Now try again with the dictionary. @@ -434,10 +425,11 @@ TEST_F(RecodeBeamTest, DISABLED_EngDictionary) { } TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { - LOG(INFO) << "Testing zh_hans dictionary" << "\n"; + LOG(INFO) << "Testing zh_hans dictionary" + << "\n"; LoadUnicharset("zh_hans.unicharset"); - GENERIC_2D_ARRAY outputs = GenerateSyntheticOutputs( - kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr); + GENERIC_2D_ARRAY outputs = + GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr); PointerVector words; ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words); // Each is an individual word, with permuter = top choice. @@ -451,11 +443,10 @@ TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { // Number of words expected. const int kNumWords = 5; // Content of the words. - const char* kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"}; + const char *kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"}; // Permuters of the words. 
- const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, - TOP_CHOICE_PERM, TOP_CHOICE_PERM, - SYSTEM_DAWG_PERM}; + const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM, + TOP_CHOICE_PERM, SYSTEM_DAWG_PERM}; EXPECT_EQ(kNumWords, words.size()); for (int w = 0; w < kNumWords && w < words.size(); ++w) { EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str()); @@ -466,18 +457,18 @@ TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { // Tests that a recoder built with decomposed unicode allows true ctc // arbitrary duplicates and inserted nulls inside the multicode sequence. TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) { - LOG(INFO) << "Testing duplicates in multi-code sequences" << "\n"; + LOG(INFO) << "Testing duplicates in multi-code sequences" + << "\n"; LoadUnicharset("vie.d.unicharset"); tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset); TRand random; - GENERIC_2D_ARRAY outputs = GenerateSyntheticOutputs( - kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random); + GENERIC_2D_ARRAY outputs = + GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random); PointerVector words; std::string truth_str; - tesseract::NormalizeUTF8String( - tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, - tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str); + tesseract::NormalizeUTF8String(tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str); ExpectCorrect(outputs, truth_str, nullptr, &words); } -} // namespace +} // namespace tesseract diff --git a/unittest/rect_test.cc b/unittest/rect_test.cc index 5d9d439f..c1f60f48 100644 --- a/unittest/rect_test.cc +++ b/unittest/rect_test.cc @@ -16,7 +16,7 @@ namespace tesseract { class TBOXTest : public testing::Test { - public: +public: void SetUp() { std::locale::global(std::locale("")); } @@ -52,10 +52,8 @@ TEST_F(TBOXTest, OverlapFractionCorners) { 
TBOX top_left(5, 25, 15, 35); // other corners covered by symmetry - EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), - mid.overlap_fraction(bottom_left)); - EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), - bottom_left.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(bottom_left)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), bottom_left.overlap_fraction(mid)); EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(top_left)); EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), top_left.overlap_fraction(mid)); } @@ -102,14 +100,10 @@ TEST_F(TBOXTest, OverlapFractionSpan) { TBOX horizontal(5, 15, 35, 25); // other sides covered by symmetry in other test cases - EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0), - mid.overlap_fraction(vertical)); - EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0), - vertical.overlap_fraction(mid)); - EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0), - mid.overlap_fraction(horizontal)); - EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0), - horizontal.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0), mid.overlap_fraction(vertical)); + EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0), vertical.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(horizontal)); + EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0), horizontal.overlap_fraction(mid)); } // TODO(nbeato): pretty much all cases @@ -173,4 +167,4 @@ TEST_F(TBOXTest, OverlapYFractionZeroSize) { EXPECT_DOUBLE_EQ(0.0, small.y_overlap_fraction(zero)); } -} // namespace +} // namespace tesseract diff --git a/unittest/resultiterator_test.cc b/unittest/resultiterator_test.cc index dfaa352c..6af52311 100644 --- a/unittest/resultiterator_test.cc +++ b/unittest/resultiterator_test.cc @@ -1,14 +1,14 @@ -#include -#include #include #include +#include +#include #include "genericvector.h" #include "scrollview.h" +#include "absl/strings/str_format.h" // for absl::StrFormat #include 
"include_gunit.h" -#include "log.h" // for LOG -#include "absl/strings/str_format.h" // for absl::StrFormat +#include "log.h" // for LOG namespace tesseract { @@ -17,39 +17,43 @@ namespace tesseract { // Helper functions for converting to STL vectors template -void ToVector(const GenericVector& from, std::vector* to) { +void ToVector(const GenericVector &from, std::vector *to) { to->clear(); - for (int i = 0; i < from.size(); i++) to->push_back(from[i]); + for (int i = 0; i < from.size(); i++) + to->push_back(from[i]); } template -void ToVector(const std::vector& from, std::vector* to) { +void ToVector(const std::vector &from, std::vector *to) { to->clear(); - for (int i = 0; i < from.size(); i++) to->push_back(from[i]); + for (int i = 0; i < from.size(); i++) + to->push_back(from[i]); } // The fixture for testing Tesseract. class ResultIteratorTest : public testing::Test { - protected: - std::string TestDataNameToPath(const std::string& name) { - return file::JoinPath(TESTING_DIR , name); +protected: + std::string TestDataNameToPath(const std::string &name) { + return file::JoinPath(TESTING_DIR, name); } std::string TessdataPath() { return file::JoinPath(TESSDATA_DIR, ""); } - std::string OutputNameToPath(const std::string& name) { + std::string OutputNameToPath(const std::string &name) { file::MakeTmpdir(); return file::JoinPath(FLAGS_test_tmpdir, name); } - ResultIteratorTest() { src_pix_ = nullptr; } + ResultIteratorTest() { + src_pix_ = nullptr; + } ~ResultIteratorTest() {} - void SetImage(const char* filename) { + void SetImage(const char *filename) { src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY); -// if (!FLAGS_tess_config.empty()) -// api_.ReadConfigFile(FLAGS_tess_config.c_str()); + // if (!FLAGS_tess_config.empty()) + // api_.ReadConfigFile(FLAGS_tess_config.c_str()); api_.SetPageSegMode(tesseract::PSM_AUTO); api_.SetImage(src_pix_); pixDestroy(&src_pix_); @@ -59,14 +63,15 
@@ class ResultIteratorTest : public testing::Test { // Rebuilds the image using the binary images at the given level, and // EXPECTs that the number of pixels in the xor of the rebuilt image with // the original is at most max_diff. - void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator* it) { + void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator *it) { it->Begin(); int width = pixGetWidth(src_pix_); int height = pixGetHeight(src_pix_); int depth = pixGetDepth(src_pix_); - Pix* pix = pixCreate(width, height, depth); + Pix *pix = pixCreate(width, height, depth); EXPECT_TRUE(depth == 1 || depth == 8); - if (depth == 8) pixSetAll(pix); + if (depth == 8) + pixSetAll(pix); do { int left, top, right, bottom; PageIteratorLevel im_level = level; @@ -75,33 +80,32 @@ class ResultIteratorTest : public testing::Test { im_level = tesseract::RIL_BLOCK; EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom)); } - LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right - << ", B:" << bottom << "]" << "\n"; - Pix* block_pix; + LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right << ", B:" << bottom + << "]" + << "\n"; + Pix *block_pix; if (depth == 1) { block_pix = it->GetBinaryImage(im_level); - pixRasterop(pix, left, top, right - left, bottom - top, - PIX_SRC ^ PIX_DST, block_pix, 0, 0); + pixRasterop(pix, left, top, right - left, bottom - top, PIX_SRC ^ PIX_DST, block_pix, 0, 0); } else { block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top); - pixRasterop(pix, left, top, pixGetWidth(block_pix), - pixGetHeight(block_pix), PIX_SRC & PIX_DST, block_pix, 0, - 0); + pixRasterop(pix, left, top, pixGetWidth(block_pix), pixGetHeight(block_pix), + PIX_SRC & PIX_DST, block_pix, 0, 0); } CHECK(block_pix != nullptr); pixDestroy(&block_pix); } while (it->Next(level)); -// if (base::GetFlag(FLAGS_v) >= 1) -// pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG); + // if (base::GetFlag(FLAGS_v) 
>= 1) + // pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG); pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0); if (depth == 8) { - Pix* binary_pix = pixThresholdToBinary(pix, 128); + Pix *binary_pix = pixThresholdToBinary(pix, 128); pixDestroy(&pix); pixInvert(binary_pix, binary_pix); pix = binary_pix; } -// if (base::GetFlag(FLAGS_v) >= 1) -// pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG); + // if (base::GetFlag(FLAGS_v) >= 1) + // pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG); l_int32 pixcount; pixCountPixels(pix, &pixcount, nullptr); if (pixcount > max_diff) { @@ -112,18 +116,17 @@ class ResultIteratorTest : public testing::Test { pixDestroy(&pix); LOG(INFO) << absl::StrFormat("At level %d: pix diff = %d\n", level, pixcount); EXPECT_LE(pixcount, max_diff); -// if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff); + // if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff); } // Rebuilds the text from the iterator strings at the given level, and // EXPECTs that the rebuild string exactly matches the truth string. 
- void VerifyIteratorText(const std::string& truth, PageIteratorLevel level, - ResultIterator* it) { + void VerifyIteratorText(const std::string &truth, PageIteratorLevel level, ResultIterator *it) { LOG(INFO) << "Text Test Level " << level << "\n"; it->Begin(); std::string result; do { - char* text = it->GetUTF8Text(level); + char *text = it->GetUTF8Text(level); result += text; delete[] text; if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) && @@ -134,16 +137,15 @@ class ResultIteratorTest : public testing::Test { result += ' '; } if (it->IsAtFinalElement(tesseract::RIL_PARA, level) && - !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) - result += '\n'; + !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) + result += '\n'; } } while (it->Next(level)); - EXPECT_STREQ(truth.c_str(), result.c_str()) - << "Rebuild failed at Text Level " << level; + EXPECT_STREQ(truth.c_str(), result.c_str()) << "Rebuild failed at Text Level " << level; } - void VerifyRebuilds(int block_limit, int para_limit, int line_limit, - int word_limit, int symbol_limit, PageIterator* it) { + void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit, + int symbol_limit, PageIterator *it) { VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it); VerifyRebuild(para_limit, tesseract::RIL_PARA, it); VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it); @@ -151,7 +153,7 @@ class ResultIteratorTest : public testing::Test { VerifyRebuild(symbol_limit, tesseract::RIL_SYMBOL, it); } - void VerifyAllText(const std::string& truth, ResultIterator* it) { + void VerifyAllText(const std::string &truth, ResultIterator *it) { VerifyIteratorText(truth, tesseract::RIL_BLOCK, it); VerifyIteratorText(truth, tesseract::RIL_PARA, it); VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it); @@ -164,9 +166,8 @@ class ResultIteratorTest : public testing::Test { // expected output reading order // (expected_reading_order[num_reading_order_entries]) and a given reading 
// context (ltr or rtl). - void ExpectTextlineReadingOrder(bool in_ltr_context, - const StrongScriptDirection* word_dirs, - int num_words, int* expected_reading_order, + void ExpectTextlineReadingOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs, + int num_words, int *expected_reading_order, int num_reading_order_entries) const { std::vector gv_word_dirs; for (int i = 0; i < num_words; i++) { @@ -174,12 +175,10 @@ class ResultIteratorTest : public testing::Test { } std::vector output; - ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, - &output); + ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &output); // STL vector can be used with EXPECT_EQ, so convert... - std::vector correct_order( - expected_reading_order, - expected_reading_order + num_reading_order_entries); + std::vector correct_order(expected_reading_order, + expected_reading_order + num_reading_order_entries); std::vector calculated_order; ToVector(output, &calculated_order); EXPECT_EQ(correct_order, calculated_order); @@ -189,8 +188,7 @@ class ResultIteratorTest : public testing::Test { // for a given array of word_dirs[num_words] in ltr or rtl context. // Sane means that the output contains some permutation of the indices // 0..[num_words - 1] interspersed optionally with negative (marker) values. 
- void VerifySaneTextlineOrder(bool in_ltr_context, - const StrongScriptDirection* word_dirs, + void VerifySaneTextlineOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs, int num_words) const { std::vector gv_word_dirs; for (int i = 0; i < num_words; i++) { @@ -198,14 +196,14 @@ class ResultIteratorTest : public testing::Test { } std::vector output; - ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, - &output); + ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &output); ASSERT_GE(output.size(), num_words); std::vector output_copy(output); std::sort(output_copy.begin(), output_copy.end()); bool sane = true; int j = 0; - while (j < output_copy.size() && output_copy[j] < 0) j++; + while (j < output_copy.size() && output_copy[j] < 0) + j++; for (int i = 0; i < num_words; i++, j++) { if (output_copy[j] != i) { sane = false; @@ -218,14 +216,13 @@ class ResultIteratorTest : public testing::Test { if (!sane) { std::vector output_copy2, empty; ToVector(output, &output_copy2); - EXPECT_EQ(output_copy2, empty) - << " permutation of 0.." << num_words - 1 << " not found in " - << (in_ltr_context ? "ltr" : "rtl") << " context."; + EXPECT_EQ(output_copy2, empty) << " permutation of 0.." << num_words - 1 << " not found in " + << (in_ltr_context ? "ltr" : "rtl") << " context."; } } // Objects declared here can be used by all tests in the test case for Foo. - Pix* src_pix_; // Borrowed from api_. Do not destroy. + Pix *src_pix_; // Borrowed from api_. Do not destroy. std::string ocr_text_; tesseract::TessBaseAPI api_; }; @@ -233,7 +230,7 @@ class ResultIteratorTest : public testing::Test { // Tests layout analysis output (and scrollview) on the UNLV page numbered // 8087_054.3G.tif. (Dubrovnik), but only if --visual_test is true. 
// -//TEST_F(ResultIteratorTest, VisualTest) { +// TEST_F(ResultIteratorTest, VisualTest) { // if (!FLAGS_visual_test) return; // const char* kIms[] = {"8087_054.3G.tif", "8071_093.3B.tif", nullptr}; // for (int i = 0; kIms[i] != nullptr; ++i) { @@ -276,7 +273,7 @@ class ResultIteratorTest : public testing::Test { TEST_F(ResultIteratorTest, EasyTest) { SetImage("phototest.tif"); // Just run layout analysis. - PageIterator* p_it = api_.AnalyseLayout(); + PageIterator *p_it = api_.AnalyseLayout(); EXPECT_FALSE(p_it == nullptr); // Check iterator position. EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK)); @@ -285,23 +282,27 @@ TEST_F(ResultIteratorTest, EasyTest) { EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK)); // The images should rebuild almost perfectly. - LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" << "\n"; + LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" + << "\n"; VerifyRebuilds(10, 10, 0, 0, 0, p_it); delete p_it; - char* result = api_.GetUTF8Text(); + char *result = api_.GetUTF8Text(); ocr_text_ = result; delete[] result; - ResultIterator* r_it = api_.GetIterator(); + ResultIterator *r_it = api_.GetIterator(); // The images should rebuild almost perfectly. - LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" << "\n"; + LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" + << "\n"; VerifyRebuilds(8, 8, 0, 0, 40, r_it); // Test the text. - LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" << "\n"; + LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" + << "\n"; VerifyAllText(ocr_text_, r_it); // The images should rebuild almost perfectly. 
- LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" << "\n"; + LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" + << "\n"; VerifyRebuilds(8, 8, 0, 0, 40, r_it); r_it->Begin(); @@ -325,14 +326,14 @@ TEST_F(ResultIteratorTest, EasyTest) { do { bool bold, italic, underlined, monospace, serif, smallcaps; int pointsize, font_id; - const char* font = - r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, - &serif, &smallcaps, &pointsize, &font_id); + const char *font = r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, + &smallcaps, &pointsize, &font_id); float confidence = r_it->Confidence(tesseract::RIL_WORD); EXPECT_GE(confidence, 80.0f); - char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); - LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g", - word_str, font, font_id, pointsize, confidence) << "\n"; + char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g", word_str, font, + font_id, pointsize, confidence) + << "\n"; delete[] word_str; EXPECT_FALSE(bold); EXPECT_FALSE(italic); @@ -352,7 +353,7 @@ TEST_F(ResultIteratorTest, EasyTest) { TEST_F(ResultIteratorTest, ComplexTest) { SetImage("8087_054.3B.tif"); // Just run layout analysis. - PageIterator* it = api_.AnalyseLayout(); + PageIterator *it = api_.AnalyseLayout(); EXPECT_FALSE(it == nullptr); // The images should rebuild almost perfectly. VerifyRebuilds(2073, 2073, 2080, 2081, 2090, it); @@ -363,7 +364,7 @@ TEST_F(ResultIteratorTest, ComplexTest) { TEST_F(ResultIteratorTest, GreyTest) { SetImage("8087_054.3G.tif"); // Just run layout analysis. - PageIterator* it = api_.AnalyseLayout(); + PageIterator *it = api_.AnalyseLayout(); EXPECT_FALSE(it == nullptr); // The images should rebuild almost perfectly. 
VerifyRebuilds(600, 600, 600, 600, 600, it); @@ -373,9 +374,9 @@ TEST_F(ResultIteratorTest, GreyTest) { // Tests that Tesseract gets smallcaps and dropcaps. TEST_F(ResultIteratorTest, SmallCapDropCapTest) { SetImage("8071_093.3B.tif"); - char* result = api_.GetUTF8Text(); + char *result = api_.GetUTF8Text(); delete[] result; - ResultIterator* r_it = api_.GetIterator(); + ResultIterator *r_it = api_.GetIterator(); // Iterate over the words. int found_dropcaps = 0; int found_smallcaps = 0; @@ -383,32 +384,30 @@ TEST_F(ResultIteratorTest, SmallCapDropCapTest) { do { bool bold, italic, underlined, monospace, serif, smallcaps; int pointsize, font_id; - r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, - &smallcaps, &pointsize, &font_id); - char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, + &pointsize, &font_id); + char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); if (word_str != nullptr) { - LOG(INFO) << absl::StrFormat("Word %s is %s", word_str, - smallcaps ? "SMALLCAPS" : "Normal") << "\n"; + LOG(INFO) << absl::StrFormat("Word %s is %s", word_str, smallcaps ? "SMALLCAPS" : "Normal") + << "\n"; if (r_it->SymbolIsDropcap()) { ++found_dropcaps; } if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 || - strcmp(word_str, "RALPH") == 0 || - strcmp(word_str, "KINNEY") == 0 || // Not working yet. + strcmp(word_str, "RALPH") == 0 || strcmp(word_str, "KINNEY") == 0 || // Not working yet. strcmp(word_str, "BENNETT") == 0) { EXPECT_TRUE(smallcaps) << word_str; ++found_smallcaps; } else { - if (smallcaps) ++false_positives; + if (smallcaps) + ++false_positives; } // No symbol other than the first of any word should be dropcap. 
ResultIterator s_it(*r_it); - while (s_it.Next(tesseract::RIL_SYMBOL) && - !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) { + while (s_it.Next(tesseract::RIL_SYMBOL) && !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) { if (s_it.SymbolIsDropcap()) { - char* sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); - LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str, - word_str); + char *sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); + LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str, word_str); delete[] sym_str; } EXPECT_FALSE(s_it.SymbolIsDropcap()); @@ -480,17 +479,12 @@ TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) { const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR}; int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart, 0, 1, 2, 3, ResultIterator::kMinorRunEnd}; - int reading_order_ltr_context[] = {0, 1, - 2, 3, - 4, ResultIterator::kMinorRunStart, - 7, 6, - 5, ResultIterator::kMinorRunEnd}; + int reading_order_ltr_context[] = { + 0, 1, 2, 3, 4, ResultIterator::kMinorRunStart, 7, 6, 5, ResultIterator::kMinorRunEnd}; - ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), - reading_order_ltr_context, + ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context, countof(reading_order_ltr_context)); - ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), - reading_order_rtl_context, + ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context, countof(reading_order_rtl_context)); } @@ -502,15 +496,12 @@ TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) { int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7}; // In the strange event that this shows up in an RTL paragraph, nonetheless // just presume the whole thing is an LTR line. 
- int reading_order_rtl_context[] = { - ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7, - ResultIterator::kMinorRunEnd}; + int reading_order_rtl_context[] = {ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7, + ResultIterator::kMinorRunEnd}; - ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), - reading_order_ltr_context, + ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context, countof(reading_order_ltr_context)); - ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), - reading_order_rtl_context, + ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context, countof(reading_order_rtl_context)); } @@ -520,8 +511,7 @@ TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) { const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR}; // The order here is just right-to-left, nothing fancy. int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0}; - ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), - reading_order_rtl_context, + ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context, countof(reading_order_rtl_context)); } @@ -529,7 +519,7 @@ TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) { // Iterate through all 7-word sequences and make sure that the output // contains each of the indices 0..6 exactly once. const int kNumWords(7); - const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations + const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations StrongScriptDirection word_dirs[kNumWords]; for (int i = 0; i < kNumCombos; i++) { // generate the next combination. 
@@ -546,28 +536,28 @@ TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) { // TODO: Missing image TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) { SetImage("5318c4b679264.jpg"); - char* result = api_.GetUTF8Text(); + char *result = api_.GetUTF8Text(); delete[] result; - ResultIterator* r_it = api_.GetIterator(); + ResultIterator *r_it = api_.GetIterator(); // Iterate over the words. do { - char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); if (word_str != nullptr) { LOG(INFO) << absl::StrFormat("Word %s:", word_str) << "\n"; ResultIterator s_it = *r_it; do { tesseract::ChoiceIterator c_it(s_it); do { - const char* char_str = c_it.GetUTF8Text(); + const char *char_str = c_it.GetUTF8Text(); if (char_str == nullptr) - LOG(INFO) << "Null char choice" << "\n"; + LOG(INFO) << "Null char choice" + << "\n"; else LOG(INFO) << "Char choice " << char_str << "\n"; CHECK(char_str != nullptr); } while (c_it.Next()); - } while ( - !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && - s_it.Next(tesseract::RIL_SYMBOL)); + } while (!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && + s_it.Next(tesseract::RIL_SYMBOL)); delete[] word_str; } } while (r_it->Next(tesseract::RIL_WORD)); @@ -576,37 +566,36 @@ TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) { // TODO: Missing image TEST_F(ResultIteratorTest, NonNullConfidencesTest) { -// SetImage("line6.tiff"); + // SetImage("line6.tiff"); SetImage("trainingitalline.tif"); api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); // Force recognition so we can used the result iterator. // We don't care about the return from GetUTF8Text. - char* result = api_.GetUTF8Text(); + char *result = api_.GetUTF8Text(); delete[] result; - ResultIterator* r_it = api_.GetIterator(); + ResultIterator *r_it = api_.GetIterator(); // Iterate over the words. 
do { - char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); if (word_str != nullptr) { EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD)); EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL)); ResultIterator s_it = *r_it; do { - const char* char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); + const char *char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); CHECK(char_str != nullptr); float confidence = s_it.Confidence(tesseract::RIL_SYMBOL); - LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str, - confidence); + LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str, confidence); delete[] char_str; - } while ( - !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && - s_it.Next(tesseract::RIL_SYMBOL)); + } while (!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && + s_it.Next(tesseract::RIL_SYMBOL)); delete[] word_str; } else { - LOG(INFO) << "Empty word found" << "\n"; + LOG(INFO) << "Empty word found" + << "\n"; } } while (r_it->Next(tesseract::RIL_WORD)); delete r_it; } -} // namespace +} // namespace tesseract diff --git a/unittest/scanutils_test.cc b/unittest/scanutils_test.cc index e6917fce..0bf8ad51 100644 --- a/unittest/scanutils_test.cc +++ b/unittest/scanutils_test.cc @@ -9,7 +9,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include // for cout +#include // for cout #include "include_gunit.h" #include "scanutils.h" @@ -17,9 +17,8 @@ namespace tesseract { class ScanutilsTest : public ::testing::Test { - protected: - void SetUp() override { - } +protected: + void SetUp() override {} }; TEST_F(ScanutilsTest, DoesScanf) { @@ -27,12 +26,12 @@ TEST_F(ScanutilsTest, DoesScanf) { // There are probably a gazillion more test cases that could be added, but // these brought the tesseract and unittest test results in line. 
std::string filename = file::JoinPath(TESTDATA_DIR, "scanftest.txt"); - FILE* fp1 = fopen(filename.c_str(), "r"); + FILE *fp1 = fopen(filename.c_str(), "r"); if (fp1 == nullptr) { std::cout << "Failed to open file " << filename << '\n'; GTEST_SKIP(); } - FILE* fp2 = fopen(filename.c_str(), "r"); + FILE *fp2 = fopen(filename.c_str(), "r"); if (fp2 == nullptr) { std::cout << "Failed to open file " << filename << '\n'; fclose(fp1); @@ -95,7 +94,8 @@ TEST_F(ScanutilsTest, DoesScanf) { r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]); r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]); EXPECT_EQ(r1, r2); - for (int i = 0; i < kNumFloats; ++i) EXPECT_FLOAT_EQ(f1[i], f2[i]); + for (int i = 0; i < kNumFloats; ++i) + EXPECT_FLOAT_EQ(f1[i], f2[i]); // Test the * for field suppression. r1 = fscanf(fp1, "%d %*s %*d %*f %*f", &i1[0]); r2 = tfscanf(fp2, "%d %*s %*d %*f %*f", &i2[0]); @@ -111,4 +111,4 @@ TEST_F(ScanutilsTest, DoesScanf) { fclose(fp1); } -} // namespace +} // namespace tesseract diff --git a/unittest/shapetable_test.cc b/unittest/shapetable_test.cc index 285ed833..17ae502a 100644 --- a/unittest/shapetable_test.cc +++ b/unittest/shapetable_test.cc @@ -12,7 +12,7 @@ #include #include -#include "absl/strings/str_format.h" // for absl::StrFormat +#include "absl/strings/str_format.h" // for absl::StrFormat #include "include_gunit.h" @@ -24,19 +24,19 @@ namespace tesseract { #ifndef DISABLED_LEGACY_ENGINE -static std::string TmpNameToPath(const std::string& name) { +static std::string TmpNameToPath(const std::string &name) { return file::JoinPath(FLAGS_test_tmpdir, name); } // Sets up a simple shape with some unichars. -static void Setup352(int font_id, Shape* shape) { +static void Setup352(int font_id, Shape *shape) { shape->AddToShape(3, font_id); shape->AddToShape(5, font_id); shape->AddToShape(2, font_id); } // Verifies some properties of the 352 shape. 
-static void Expect352(int font_id, const Shape& shape) { +static void Expect352(int font_id, const Shape &shape) { EXPECT_EQ(3, shape.size()); EXPECT_TRUE(shape.ContainsUnichar(2)); EXPECT_TRUE(shape.ContainsUnichar(3)); @@ -53,7 +53,7 @@ static void Expect352(int font_id, const Shape& shape) { // The fixture for testing Shape. class ShapeTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); @@ -72,7 +72,7 @@ TEST_F(ShapeTest, BasicTest) { Expect352(101, shape1); // It should still work after file I/O. std::string filename = TmpNameToPath("shapefile"); - FILE* fp = fopen(filename.c_str(), "wb"); + FILE *fp = fopen(filename.c_str(), "wb"); ASSERT_TRUE(fp != nullptr); EXPECT_TRUE(shape1.Serialize(fp)); fclose(fp); @@ -103,9 +103,9 @@ TEST_F(ShapeTest, AddShapeTest) { Expect352(101, shape1); // Now setup a different shape with different content. Shape shape2; - shape2.AddToShape(3, 101); // Duplicates shape1. - shape2.AddToShape(5, 110); // Different font to shape1. - shape2.AddToShape(7, 101); // Different unichar to shape1. + shape2.AddToShape(3, 101); // Duplicates shape1. + shape2.AddToShape(5, 110); // Different font to shape1. + shape2.AddToShape(7, 101); // Different unichar to shape1. // They should NOT be subsets of each other. 
EXPECT_FALSE(shape1.IsSubsetOf(shape2)); EXPECT_FALSE(shape2.IsSubsetOf(shape1)); @@ -179,4 +179,4 @@ TEST_F(ShapeTableTest, FullTest) { #endif } -} // namespace +} // namespace tesseract diff --git a/unittest/stats_test.cc b/unittest/stats_test.cc index 72219b84..25373aa7 100644 --- a/unittest/stats_test.cc +++ b/unittest/stats_test.cc @@ -20,7 +20,7 @@ namespace tesseract { const int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1}; class STATSTest : public testing::Test { - public: +public: void SetUp() { std::locale::global(std::locale("")); stats_.set_range(0, 16); @@ -56,4 +56,4 @@ TEST_F(STATSTest, TopNModes) { EXPECT_EQ(6, modes[2].data()); } -} // namespace. +} // namespace tesseract diff --git a/unittest/stridemap_test.cc b/unittest/stridemap_test.cc index fa1ef234..131bc79e 100644 --- a/unittest/stridemap_test.cc +++ b/unittest/stridemap_test.cc @@ -10,9 +10,9 @@ // limitations under the License. #ifdef INCLUDE_TENSORFLOW -#include // for xla::Array2D +# include // for xla::Array2D #else -#include // std::array +# include // std::array #endif #include "include_gunit.h" #include "stridemap.h" @@ -24,20 +24,18 @@ namespace xla { template class Array2D : public std::vector { - public: +public: Array2D() : std::vector(std::vector{0, 0}) {} - Array2D(const int64_t n1, const int64_t n2) - : std::vector(std::vector{n1, n2}) {} + Array2D(const int64_t n1, const int64_t n2) : std::vector(std::vector{n1, n2}) {} - Array2D(const int64_t n1, const int64_t n2, const T value) - : std::vector({n1, n2}, value) {} + Array2D(const int64_t n1, const int64_t n2, const T value) : std::vector({n1, n2}, value) {} }; -} +} // namespace xla #endif class StridemapTest : public ::testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } @@ -50,11 +48,11 @@ class StridemapTest : public ::testing::Test { int value = start; for (int y = 0; y < ysize; ++y) { for (int x = 0; x < xsize; ++x) { -#ifdef INCLUDE_TENSORFLOW +# ifdef 
INCLUDE_TENSORFLOW (*a)(y, x) = value++; -#else +# else a[y][x] = value++; -#endif +# endif } } return a; @@ -81,17 +79,13 @@ TEST_F(StridemapTest, Indexing) { int pos = 0; do { EXPECT_GE(index.t(), pos); - EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), - index.index(FD_WIDTH)), + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)), pos); - EXPECT_EQ(index.IsLast(FD_BATCH), - index.index(FD_BATCH) == arrays.size() - 1); - EXPECT_EQ( - index.IsLast(FD_HEIGHT), - index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1); - EXPECT_EQ( - index.IsLast(FD_WIDTH), - index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1); + EXPECT_EQ(index.IsLast(FD_BATCH), index.index(FD_BATCH) == arrays.size() - 1); + EXPECT_EQ(index.IsLast(FD_HEIGHT), + index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1); + EXPECT_EQ(index.IsLast(FD_WIDTH), + index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1); EXPECT_TRUE(index.IsValid()); ++pos; } while (index.Increment()); @@ -100,8 +94,7 @@ TEST_F(StridemapTest, Indexing) { do { --pos; EXPECT_GE(index.t(), pos); - EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), - index.index(FD_WIDTH)), + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)), pos); StrideMap::Index copy(index); // Since a change in batch index changes the height and width, it isn't @@ -139,10 +132,10 @@ TEST_F(StridemapTest, Scaling) { // scaling/reduction functions work as expected. 
#ifdef INCLUDE_TENSORFLOW std::vector>> arrays; - arrays.push_back(SetupArray(3, 4, 0)); // 0-11 - arrays.push_back(SetupArray(4, 5, 12)); // 12-31 - arrays.push_back(SetupArray(4, 4, 32)); // 32-47 - arrays.push_back(SetupArray(3, 5, 48)); // 48-62 + arrays.push_back(SetupArray(3, 4, 0)); // 0-11 + arrays.push_back(SetupArray(4, 5, 12)); // 12-31 + arrays.push_back(SetupArray(4, 4, 32)); // 32-47 + arrays.push_back(SetupArray(3, 5, 48)); // 48-62 std::vector> h_w_sizes; for (size_t i = 0; i < arrays.size(); ++i) { h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width()); @@ -151,33 +144,29 @@ TEST_F(StridemapTest, Scaling) { stride_map.SetStride(h_w_sizes); // Scale x by 2, keeping y the same. - std::vector values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18, - 22, 23, 27, 28, 32, 33, 36, 37, 40, 41, - 44, 45, 48, 49, 53, 54, 58, 59}; + std::vector values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18, 22, 23, 27, 28, + 32, 33, 36, 37, 40, 41, 44, 45, 48, 49, 53, 54, 58, 59}; StrideMap test_map(stride_map); test_map.ScaleXY(2, 1); StrideMap::Index index(test_map); int pos = 0; do { int expected_value = values_x2[pos++]; - EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), - index.index(FD_WIDTH)), + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)), expected_value); } while (index.Increment()); EXPECT_EQ(pos, values_x2.size()); test_map = stride_map; // Scale y by 2, keeping x the same. 
- std::vector values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 32, 33, 34, 35, - 36, 37, 38, 39, 48, 49, 50, 51, 52}; + std::vector values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52}; test_map.ScaleXY(1, 2); index.InitToFirst(); pos = 0; do { int expected_value = values_y2[pos++]; - EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), - index.index(FD_WIDTH)), + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)), expected_value); } while (index.Increment()); EXPECT_EQ(pos, values_y2.size()); @@ -190,23 +179,20 @@ TEST_F(StridemapTest, Scaling) { pos = 0; do { int expected_value = values_xy2[pos++]; - EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), - index.index(FD_WIDTH)), + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)), expected_value); } while (index.Increment()); EXPECT_EQ(pos, values_xy2.size()); test_map = stride_map; // Reduce Width to 1. 
- std::vector values_x_to_1 = {0, 4, 8, 12, 17, 22, 27, - 32, 36, 40, 44, 48, 53, 58}; + std::vector values_x_to_1 = {0, 4, 8, 12, 17, 22, 27, 32, 36, 40, 44, 48, 53, 58}; test_map.ReduceWidthTo1(); index.InitToFirst(); pos = 0; do { int expected_value = values_x_to_1[pos++]; - EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), - index.index(FD_WIDTH)), + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)), expected_value); } while (index.Increment()); EXPECT_EQ(pos, values_x_to_1.size()); @@ -216,4 +202,4 @@ TEST_F(StridemapTest, Scaling) { #endif } -} // namespace +} // namespace tesseract diff --git a/unittest/stringrenderer_test.cc b/unittest/stringrenderer_test.cc index f261fd13..75ccdcaf 100644 --- a/unittest/stringrenderer_test.cc +++ b/unittest/stringrenderer_test.cc @@ -17,8 +17,8 @@ #include "stringrenderer.h" #include "strngs.h" -#include "absl/strings/str_split.h" // for absl::StrSplit #include +#include "absl/strings/str_split.h" // for absl::StrSplit #include #include @@ -40,10 +40,10 @@ const char kEngNonLigatureText[] = "fidelity"; // Same as kEngNonLigatureText, but with "fi" replaced with its ligature. 
const char kEngLigatureText[] = "fidelity"; -static PangoFontMap* font_map; +static PangoFontMap *font_map; class StringRendererTest : public ::testing::Test { - protected: +protected: void SetUp() override { if (!font_map) { font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); @@ -62,15 +62,16 @@ class StringRendererTest : public ::testing::Test { PangoFontInfo::SoftInitFontConfig(); // init early } - void DisplayClusterBoxes(Pix* pix) { - if (!FLAGS_display) return; - const std::vector& boxchars = renderer_->GetBoxes(); - Boxa* boxes = boxaCreate(0); - for (const auto& boxchar : boxchars) { + void DisplayClusterBoxes(Pix *pix) { + if (!FLAGS_display) + return; + const std::vector &boxchars = renderer_->GetBoxes(); + Boxa *boxes = boxaCreate(0); + for (const auto &boxchar : boxchars) { if (boxchar->box()) - boxaAddBox(boxes, const_cast(boxchar->box()), L_CLONE); + boxaAddBox(boxes, const_cast(boxchar->box()), L_CLONE); } - Pix* box_pix = pixDrawBoxaRandom(pix, boxes, 1); + Pix *box_pix = pixDrawBoxaRandom(pix, boxes, 1); boxaDestroy(&boxes); pixDisplay(box_pix, 0, 0); pixDestroy(&box_pix); @@ -80,32 +81,28 @@ class StringRendererTest : public ::testing::Test { TEST_F(StringRendererTest, DoesRenderToImage) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); - Pix* pix = nullptr; - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + Pix *pix = nullptr; + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); pixDestroy(&pix); renderer_.reset(new StringRenderer("UnBatang 10", 600, 600)); - EXPECT_EQ(strlen(kKorText), - renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); + EXPECT_EQ(strlen(kKorText), renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); pixDestroy(&pix); 
renderer_.reset(new StringRenderer("Lohit Hindi 10", 600, 600)); - EXPECT_EQ(strlen(kHinText), - renderer_->RenderToImage(kHinText, strlen(kHinText), &pix)); + EXPECT_EQ(strlen(kHinText), renderer_->RenderToImage(kHinText, strlen(kHinText), &pix)); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); pixDestroy(&pix); // RTL text renderer_.reset(new StringRenderer("Arab 10", 600, 600)); - EXPECT_EQ(strlen(kArabicText), - renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix)); + EXPECT_EQ(strlen(kArabicText), renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); @@ -113,8 +110,7 @@ TEST_F(StringRendererTest, DoesRenderToImage) { // Mixed direction Arabic + english text renderer_.reset(new StringRenderer("Arab 10", 600, 600)); - EXPECT_EQ(strlen(kMixedText), - renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix)); + EXPECT_EQ(strlen(kMixedText), renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); @@ -126,9 +122,8 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) { // Underline all words but NOT intervening spaces. renderer_->set_underline_start_prob(1.0); renderer_->set_underline_continuation_prob(0); - Pix* pix = nullptr; - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + Pix *pix = nullptr; + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); @@ -138,8 +133,7 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) { // Underline all words AND intervening spaces. 
renderer_->set_underline_start_prob(1.0); renderer_->set_underline_continuation_prob(1.0); - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); @@ -149,8 +143,7 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) { // Underline words and intervening spaces with 0.5 prob. renderer_->set_underline_start_prob(0.5); renderer_->set_underline_continuation_prob(0.5); - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); @@ -159,13 +152,12 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) { TEST_F(StringRendererTest, DoesHandleNewlineCharacters) { const char kRawText[] = "\n\n\n A \nB \nC \n\n\n"; - const char kStrippedText[] = " A B C "; // text with newline chars removed + const char kStrippedText[] = " A B C "; // text with newline chars removed renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); - Pix* pix = nullptr; - EXPECT_EQ(strlen(kRawText), - renderer_->RenderToImage(kRawText, strlen(kRawText), &pix)); + Pix *pix = nullptr; + EXPECT_EQ(strlen(kRawText), renderer_->RenderToImage(kRawText, strlen(kRawText), &pix)); EXPECT_TRUE(pix != nullptr); - const std::vector& boxchars = renderer_->GetBoxes(); + const std::vector &boxchars = renderer_->GetBoxes(); // 3 characters + 4 spaces => 7 boxes EXPECT_EQ(7, boxchars.size()); if (boxchars.size() == 7) { @@ -182,13 +174,12 @@ TEST_F(StringRendererTest, DoesRenderLigatures) { renderer_.reset(new StringRenderer("Arab 12", 600, 250)); const char kArabicLigature[] = "لا"; - Pix* pix = nullptr; - EXPECT_EQ( - strlen(kArabicLigature), - 
renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix)); + Pix *pix = nullptr; + EXPECT_EQ(strlen(kArabicLigature), + renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix)); EXPECT_TRUE(pix != nullptr); EXPECT_GT(renderer_->GetBoxes().size(), 0); - const std::vector& boxes = renderer_->GetBoxes(); + const std::vector &boxes = renderer_->GetBoxes(); EXPECT_EQ(1, boxes.size()); EXPECT_TRUE(boxes[0]->box() != nullptr); EXPECT_STREQ(kArabicLigature, boxes[0]->ch().c_str()); @@ -202,17 +193,17 @@ TEST_F(StringRendererTest, DoesRenderLigatures) { pixDestroy(&pix); } -static int FindBoxCharXCoord(const std::vector& boxchars, - const std::string& ch) { - for (const auto& boxchar : boxchars) { - if (boxchar->ch() == ch) return boxchar->box()->x; +static int FindBoxCharXCoord(const std::vector &boxchars, const std::string &ch) { + for (const auto &boxchar : boxchars) { + if (boxchar->ch() == ch) + return boxchar->box()->x; } return INT_MAX; } TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) { renderer_.reset(new StringRenderer("Arab 10", 600, 600)); - Pix* pix = nullptr; + Pix *pix = nullptr; // Arabic letters should be in decreasing x-coordinates const char kArabicWord[] = "\u0644\u0627\u0641\u0643\u0631"; const std::string kRevWord = "\u0631\u0643\u0641\u0627\u0644"; @@ -221,8 +212,7 @@ TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) { // Decode to get the box text strings. 
EXPECT_FALSE(boxes_str.empty()); std::vector texts; - EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts, - nullptr, nullptr)); + EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts, nullptr, nullptr)); std::string ltr_str; for (size_t i = 0; i < texts.size(); ++i) { ltr_str += texts[i].c_str(); @@ -238,15 +228,14 @@ TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) { TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) { renderer_.reset(new StringRenderer("Arab 10", 600, 600)); - Pix* pix = nullptr; + Pix *pix = nullptr; // Arabic letters should be in decreasing x-coordinates const char kArabicWord[] = "والفكر"; renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix); EXPECT_GT(renderer_->GetBoxes().size(), 0); - const std::vector& boxchars = renderer_->GetBoxes(); + const std::vector &boxchars = renderer_->GetBoxes(); for (size_t i = 1; i < boxchars.size(); ++i) { - EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) - << boxchars[i - 1]->ch(); + EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) << boxchars[i - 1]->ch(); } pixDestroy(&pix); @@ -256,8 +245,7 @@ TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) { renderer_->RenderToImage(kEnglishWord, strlen(kEnglishWord), &pix); EXPECT_EQ(boxchars.size(), strlen(kEnglishWord)); for (size_t i = 1; i < boxchars.size(); ++i) { - EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) - << boxchars[i - 1]->ch(); + EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) << boxchars[i - 1]->ch(); } pixDestroy(&pix); @@ -271,11 +259,10 @@ TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) { } TEST_F(StringRendererTest, DoesRenderVerticalText) { - Pix* pix = nullptr; + Pix *pix = nullptr; renderer_.reset(new StringRenderer("UnBatang 10", 600, 600)); renderer_->set_vertical_text(true); - EXPECT_EQ(strlen(kKorText), - renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); + EXPECT_EQ(strlen(kKorText), 
renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); EXPECT_GT(renderer_->GetBoxes().size(), 0); DisplayClusterBoxes(pix); pixDestroy(&pix); @@ -285,12 +272,11 @@ TEST_F(StringRendererTest, DoesRenderVerticalText) { // appropriate page numbers. TEST_F(StringRendererTest, DoesKeepAllImageBoxes) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); - Pix* pix = nullptr; + Pix *pix = nullptr; int num_boxes_per_page = 0; const int kNumTrials = 2; for (int i = 0; i < kNumTrials; ++i) { - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); EXPECT_TRUE(pix != nullptr); pixDestroy(&pix); EXPECT_GT(renderer_->GetBoxes().size(), 0); @@ -299,8 +285,7 @@ TEST_F(StringRendererTest, DoesKeepAllImageBoxes) { } else { EXPECT_EQ((i + 1) * num_boxes_per_page, renderer_->GetBoxes().size()); } - for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page; - ++j) { + for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page; ++j) { EXPECT_EQ(i, renderer_->GetBoxes()[j]->page()); } } @@ -308,16 +293,14 @@ TEST_F(StringRendererTest, DoesKeepAllImageBoxes) { TEST_F(StringRendererTest, DoesClearBoxes) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); - Pix* pix = nullptr; - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + Pix *pix = nullptr; + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); pixDestroy(&pix); EXPECT_GT(renderer_->GetBoxes().size(), 0); const int num_boxes_per_page = renderer_->GetBoxes().size(); renderer_->ClearBoxes(); - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); pixDestroy(&pix); EXPECT_EQ(num_boxes_per_page, renderer_->GetBoxes().size()); } @@ -325,10 +308,9 @@ 
TEST_F(StringRendererTest, DoesClearBoxes) { TEST_F(StringRendererTest, DoesLigatureTextForRendering) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); renderer_->set_add_ligatures(true); - Pix* pix = nullptr; + Pix *pix = nullptr; EXPECT_EQ(strlen(kEngNonLigatureText), - renderer_->RenderToImage(kEngNonLigatureText, - strlen(kEngNonLigatureText), &pix)); + renderer_->RenderToImage(kEngNonLigatureText, strlen(kEngNonLigatureText), &pix)); pixDestroy(&pix); // There should be one less box than letters due to the 'fi' ligature. EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size()); @@ -338,10 +320,9 @@ TEST_F(StringRendererTest, DoesLigatureTextForRendering) { TEST_F(StringRendererTest, DoesRetainInputLigatureForRendering) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); - Pix* pix = nullptr; + Pix *pix = nullptr; EXPECT_EQ(strlen(kEngLigatureText), - renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText), - &pix)); + renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText), &pix)); pixDestroy(&pix); // There should be one less box than letters due to the 'fi' ligature. 
EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size()); @@ -361,16 +342,14 @@ TEST_F(StringRendererTest, DoesStripUnrenderableWords) { TEST_F(StringRendererTest, DoesRenderWordBoxes) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); renderer_->set_output_word_boxes(true); - Pix* pix = nullptr; - EXPECT_EQ(strlen(kEngText), - renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + Pix *pix = nullptr; + EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); pixDestroy(&pix); // Verify #boxchars = #words + #spaces - std::vector words = - absl::StrSplit(kEngText, ' ', absl::SkipEmpty()); + std::vector words = absl::StrSplit(kEngText, ' ', absl::SkipEmpty()); const int kNumSpaces = words.size() - 1; const int kExpectedNumBoxes = words.size() + kNumSpaces; - const std::vector& boxchars = renderer_->GetBoxes(); + const std::vector &boxchars = renderer_->GetBoxes(); EXPECT_EQ(kExpectedNumBoxes, boxchars.size()); // Verify content of words and spaces for (size_t i = 0; i < boxchars.size(); i += 2) { @@ -385,17 +364,16 @@ TEST_F(StringRendererTest, DoesRenderWordBoxes) { TEST_F(StringRendererTest, DoesRenderWordBoxesFromMultiLineText) { renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); renderer_->set_output_word_boxes(true); - Pix* pix = nullptr; + Pix *pix = nullptr; const char kMultlineText[] = "the quick brown fox\njumps over the lazy dog"; - EXPECT_EQ(strlen(kMultlineText), - renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix)); + EXPECT_EQ(strlen(kMultlineText), renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix)); pixDestroy(&pix); // Verify #boxchars = #words + #spaces + #newlines std::vector words = absl::StrSplit(kMultlineText, absl::ByAnyChar(" \n"), absl::SkipEmpty()); const int kNumSeparators = words.size() - 1; const int kExpectedNumBoxes = words.size() + kNumSeparators; - const std::vector& boxchars = renderer_->GetBoxes(); + const std::vector &boxchars = 
renderer_->GetBoxes(); EXPECT_EQ(kExpectedNumBoxes, boxchars.size()); // Verify content of words and spaces for (size_t i = 0; i < boxchars.size(); i += 2) { @@ -412,15 +390,16 @@ TEST_F(StringRendererTest, DoesRenderAllFontsToImage) { size_t offset = 0; std::string font_used; do { - Pix* pix = nullptr; + Pix *pix = nullptr; font_used.clear(); - offset += renderer_->RenderAllFontsToImage( - 1.0, kEngText + offset, strlen(kEngText + offset), &font_used, &pix); + offset += renderer_->RenderAllFontsToImage(1.0, kEngText + offset, strlen(kEngText + offset), + &font_used, &pix); if (offset < strlen(kEngText)) { EXPECT_TRUE(pix != nullptr); EXPECT_STRNE("", font_used.c_str()); } - if (FLAGS_display) pixDisplay(pix, 0, 0); + if (FLAGS_display) + pixDisplay(pix, 0, 0); pixDestroy(&pix); } while (offset < strlen(kEngText)); } @@ -429,10 +408,10 @@ TEST_F(StringRendererTest, DoesNotRenderWordJoiner) { renderer_.reset(new StringRenderer("Verdana 10", 500, 200)); const std::string word = "A- -B C-D A BC"; const std::string joined_word = StringRenderer::InsertWordJoiners(word); - Pix* pix = nullptr; + Pix *pix = nullptr; renderer_->RenderToImage(joined_word.c_str(), joined_word.length(), &pix); pixDestroy(&pix); - const std::vector& boxchars = renderer_->GetBoxes(); + const std::vector &boxchars = renderer_->GetBoxes(); const std::string kWordJoinerUTF8 = "\u2060"; ASSERT_EQ(word.length(), boxchars.size()); for (size_t i = 0; i < boxchars.size(); ++i) { @@ -446,13 +425,12 @@ TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) { renderer_->set_drop_uncovered_chars(true); const std::string kWord = "office"; const std::string kCleanWord = "oice"; - Pix* pix = nullptr; - EXPECT_FALSE( - renderer_->font().CanRenderString(kWord.c_str(), kWord.length())); + Pix *pix = nullptr; + EXPECT_FALSE(renderer_->font().CanRenderString(kWord.c_str(), kWord.length())); EXPECT_FALSE(renderer_->font().CoversUTF8Text(kWord.c_str(), kWord.length())); int offset = 
renderer_->RenderToImage(kWord.c_str(), kWord.length(), &pix); pixDestroy(&pix); - const std::vector& boxchars = renderer_->GetBoxes(); + const std::vector &boxchars = renderer_->GetBoxes(); EXPECT_EQ(kWord.length(), offset); ASSERT_EQ(kCleanWord.length(), boxchars.size()); for (size_t i = 0; i < boxchars.size(); ++i) { @@ -465,50 +443,40 @@ TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) { TEST(ConvertBasicLatinToFullwidthLatinTest, DoesConvertBasicLatin) { const std::string kHalfAlpha = "ABCD"; const std::string kFullAlpha = "ABCD"; - EXPECT_EQ(kFullAlpha, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha)); + EXPECT_EQ(kFullAlpha, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha)); const std::string kHalfDigit = "0123"; const std::string kFullDigit = "0123"; - EXPECT_EQ(kFullDigit, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit)); + EXPECT_EQ(kFullDigit, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit)); const std::string kHalfSym = "()[]:;!?"; const std::string kFullSym = "()[]:;!?"; - EXPECT_EQ(kFullSym, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym)); + EXPECT_EQ(kFullSym, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym)); } TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertFullwidthLatin) { const std::string kFullAlpha = "ABCD"; - EXPECT_EQ(kFullAlpha, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha)); + EXPECT_EQ(kFullAlpha, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha)); const std::string kFullDigit = "0123"; - EXPECT_EQ(kFullDigit, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit)); + EXPECT_EQ(kFullDigit, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit)); const std::string kFullSym = "()[]:;!?"; - EXPECT_EQ(kFullSym, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym)); + EXPECT_EQ(kFullSym, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym)); } 
TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertNonLatin) { const std::string kHalfKana = "アイウエオ"; const std::string kFullKana = "アイウエオ"; - EXPECT_EQ(kHalfKana, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana)); - EXPECT_EQ(kFullKana, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana)); + EXPECT_EQ(kHalfKana, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana)); + EXPECT_EQ(kFullKana, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana)); } TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) { const std::string kHalfSpace = " "; const std::string kFullSpace = " "; - EXPECT_EQ(kHalfSpace, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace)); - EXPECT_EQ(kFullSpace, - StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace)); + EXPECT_EQ(kHalfSpace, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace)); + EXPECT_EQ(kFullSpace, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace)); } // ------------ StringRenderer::ConvertFullwidthLatinToBasicLatin() ------------ @@ -516,49 +484,39 @@ TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) { TEST(ConvertFullwidthLatinToBasicLatinTest, DoesConvertFullwidthLatin) { const std::string kHalfAlpha = "ABCD"; const std::string kFullAlpha = "ABCD"; - EXPECT_EQ(kHalfAlpha, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha)); + EXPECT_EQ(kHalfAlpha, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha)); const std::string kHalfDigit = "0123"; const std::string kFullDigit = "0123"; - EXPECT_EQ(kHalfDigit, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit)); + EXPECT_EQ(kHalfDigit, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit)); const std::string kHalfSym = "()[]:;!?"; const std::string kFullSym = "()[]:;!?"; - EXPECT_EQ(kHalfSym, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym)); + EXPECT_EQ(kHalfSym, 
StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym)); } TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertBasicLatin) { const std::string kHalfAlpha = "ABCD"; - EXPECT_EQ(kHalfAlpha, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha)); + EXPECT_EQ(kHalfAlpha, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha)); const std::string kHalfDigit = "0123"; - EXPECT_EQ(kHalfDigit, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit)); + EXPECT_EQ(kHalfDigit, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit)); const std::string kHalfSym = "()[]:;!?"; - EXPECT_EQ(kHalfSym, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym)); + EXPECT_EQ(kHalfSym, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym)); } TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertNonLatin) { const std::string kHalfKana = "アイウエオ"; const std::string kFullKana = "アイウエオ"; - EXPECT_EQ(kHalfKana, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana)); - EXPECT_EQ(kFullKana, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana)); + EXPECT_EQ(kHalfKana, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana)); + EXPECT_EQ(kFullKana, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana)); } TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertSpace) { const std::string kHalfSpace = " "; const std::string kFullSpace = " "; - EXPECT_EQ(kHalfSpace, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace)); - EXPECT_EQ(kFullSpace, - StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace)); + EXPECT_EQ(kHalfSpace, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace)); + EXPECT_EQ(kFullSpace, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace)); } -} // namespace +} // namespace tesseract diff --git a/unittest/syntaxnet/base.h b/unittest/syntaxnet/base.h index 5dabbbda..76fb0fbb 100644 --- a/unittest/syntaxnet/base.h +++ b/unittest/syntaxnet/base.h @@ -24,7 +24,6 
@@ limitations under the License. #include "google/protobuf/util/message_differencer.h" - #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" @@ -32,30 +31,27 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" - - -using tensorflow::int8; +using std::map; +using std::pair; +using std::unordered_map; +using std::unordered_set; +using std::vector; using tensorflow::int16; using tensorflow::int32; using tensorflow::int64; -using tensorflow::uint8; -using tensorflow::uint16; -using tensorflow::uint64; -using tensorflow::uint32; -using tensorflow::protobuf::TextFormat; -using tensorflow::mutex_lock; +using tensorflow::int8; using tensorflow::mutex; -using std::map; -using std::pair; -using std::vector; -using std::unordered_map; -using std::unordered_set; +using tensorflow::mutex_lock; +using tensorflow::uint16; +using tensorflow::uint32; +using tensorflow::uint64; +using tensorflow::uint8; +using tensorflow::protobuf::TextFormat; typedef signed int char32; -using tensorflow::StringPiece; using std::string; +using tensorflow::StringPiece; +// namespace syntaxnet - // namespace syntaxnet - -#endif // SYNTAXNET_BASE_H_ +#endif // SYNTAXNET_BASE_H_ diff --git a/unittest/tablefind_test.cc b/unittest/tablefind_test.cc index df6d511c..f51a4532 100644 --- a/unittest/tablefind_test.cc +++ b/unittest/tablefind_test.cc @@ -20,7 +20,7 @@ namespace tesseract { class TestableTableFinder : public tesseract::TableFinder { - public: +public: using TableFinder::GapInXProjection; using TableFinder::HasLeaderAdjacent; using TableFinder::InsertLeaderPartition; @@ -30,17 +30,16 @@ class TestableTableFinder : public tesseract::TableFinder { using TableFinder::set_global_median_xheight; using TableFinder::SplitAndInsertFragmentedTextPartition; - void ExpectPartition(const TBOX& box) { + void ExpectPartition(const TBOX &box) { 
tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_); gsearch.SetUniqueMode(true); gsearch.StartFullSearch(); - ColPartition* part = nullptr; + ColPartition *part = nullptr; bool found = false; while ((part = gsearch.NextFullSearch()) != nullptr) { if (part->bounding_box().left() == box.left() && part->bounding_box().bottom() == box.bottom() && - part->bounding_box().right() == box.right() && - part->bounding_box().top() == box.top()) { + part->bounding_box().right() == box.right() && part->bounding_box().top() == box.top()) { found = true; } } @@ -50,7 +49,7 @@ class TestableTableFinder : public tesseract::TableFinder { tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_); gsearch.SetUniqueMode(true); gsearch.StartFullSearch(); - ColPartition* part = nullptr; + ColPartition *part = nullptr; int count = 0; while ((part = gsearch.NextFullSearch()) != nullptr) { ++count; @@ -60,7 +59,7 @@ class TestableTableFinder : public tesseract::TableFinder { }; class TableFinderTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); free_boxes_it_.set_to_list(&free_boxes_); @@ -72,7 +71,8 @@ class TableFinderTest : public testing::Test { } void TearDown() { - if (partition_.get() != nullptr) partition_->DeleteBoxes(); + if (partition_.get() != nullptr) + partition_->DeleteBoxes(); DeletePartitionListBoxes(); finder_.reset(nullptr); } @@ -81,18 +81,18 @@ class TableFinderTest : public testing::Test { MakePartition(x_min, y_min, x_max, y_max, 0, 0); } - void MakePartition(int x_min, int y_min, int x_max, int y_max, - int first_column, int last_column) { - if (partition_.get() != nullptr) partition_->DeleteBoxes(); + void MakePartition(int x_min, int y_min, int x_max, int y_max, int first_column, + int last_column) { + if (partition_.get() != nullptr) + partition_->DeleteBoxes(); TBOX box; box.set_to_given_coords(x_min, y_min, x_max, y_max); - partition_.reset( - ColPartition::FakePartition(box, 
PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE)); + partition_.reset(ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE)); partition_->set_first_column(first_column); partition_->set_last_column(last_column); } - void InsertTextPartition(ColPartition* part) { + void InsertTextPartition(ColPartition *part) { finder_->InsertTextPartition(part); free_boxes_it_.add_after_then_move(part); } @@ -101,12 +101,12 @@ class TableFinderTest : public testing::Test { InsertLeaderPartition(x_min, y_min, x_max, y_max, 0, 0); } - void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max, - int first_column, int last_column) { + void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max, int first_column, + int last_column) { TBOX box; box.set_to_given_coords(x_min, y_min, x_max, y_max); - ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, - BRT_UNKNOWN, BTFT_LEADER); + ColPartition *part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_UNKNOWN, BTFT_LEADER); part->set_first_column(first_column); part->set_last_column(last_column); finder_->InsertLeaderPartition(part); @@ -114,9 +114,8 @@ class TableFinderTest : public testing::Test { } void DeletePartitionListBoxes() { - for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list(); - free_boxes_it_.forward()) { - ColPartition* part = free_boxes_it_.data(); + for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list(); free_boxes_it_.forward()) { + ColPartition *part = free_boxes_it_.data(); part->DeleteBoxes(); } } @@ -124,30 +123,37 @@ class TableFinderTest : public testing::Test { std::unique_ptr finder_; std::unique_ptr partition_; - private: +private: tesseract::ColPartition_CLIST free_boxes_; tesseract::ColPartition_C_IT free_boxes_it_; }; TEST_F(TableFinderTest, GapInXProjectionNoGap) { int data[100]; - for (int i = 0; i < 100; ++i) data[i] = 10; + for (int i = 0; i < 100; ++i) + data[i] = 10; EXPECT_FALSE(finder_->GapInXProjection(data, 100)); } 
TEST_F(TableFinderTest, GapInXProjectionEdgeGap) { int data[100]; - for (int i = 0; i < 10; ++i) data[i] = 2; - for (int i = 10; i < 90; ++i) data[i] = 10; - for (int i = 90; i < 100; ++i) data[i] = 2; + for (int i = 0; i < 10; ++i) + data[i] = 2; + for (int i = 10; i < 90; ++i) + data[i] = 10; + for (int i = 90; i < 100; ++i) + data[i] = 2; EXPECT_FALSE(finder_->GapInXProjection(data, 100)); } TEST_F(TableFinderTest, GapInXProjectionExists) { int data[100]; - for (int i = 0; i < 10; ++i) data[i] = 10; - for (int i = 10; i < 90; ++i) data[i] = 2; - for (int i = 90; i < 100; ++i) data[i] = 10; + for (int i = 0; i < 10; ++i) + data[i] = 10; + for (int i = 10; i < 90; ++i) + data[i] = 2; + for (int i = 90; i < 100; ++i) + data[i] = 10; EXPECT_TRUE(finder_->GapInXProjection(data, 100)); } @@ -195,7 +201,7 @@ TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicPass) { finder_->set_global_median_xheight(10); TBOX part_box(10, 5, 100, 15); - ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); + ColPartition *all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); all->set_type(PT_FLOWING_TEXT); all->set_blob_type(BRT_TEXT); all->set_flow(BTFT_CHAIN); @@ -219,9 +225,9 @@ TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicPass) { } // TODO(nbeato): Ray's newer code... // all->ClaimBoxes(); - all->ComputeLimits(); // This is to make sure median iinfo is set. - InsertTextPartition(all); // This is to delete blobs - ColPartition* fragment_me = all->CopyButDontOwnBlobs(); + all->ComputeLimits(); // This is to make sure median iinfo is set. 
+ InsertTextPartition(all); // This is to delete blobs + ColPartition *fragment_me = all->CopyButDontOwnBlobs(); finder_->SplitAndInsertFragmentedTextPartition(fragment_me); finder_->ExpectPartition(TBOX(11, 5, 24, 15)); @@ -235,7 +241,7 @@ TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) { finder_->set_global_median_xheight(10); TBOX part_box(10, 5, 100, 15); - ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); + ColPartition *all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); all->set_type(PT_FLOWING_TEXT); all->set_blob_type(BRT_TEXT); all->set_flow(BTFT_CHAIN); @@ -249,13 +255,13 @@ TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) { } // TODO(nbeato): Ray's newer code... // all->ClaimBoxes(); - all->ComputeLimits(); // This is to make sure median iinfo is set. - InsertTextPartition(all); // This is to delete blobs - ColPartition* fragment_me = all->CopyButDontOwnBlobs(); + all->ComputeLimits(); // This is to make sure median iinfo is set. 
+ InsertTextPartition(all); // This is to delete blobs + ColPartition *fragment_me = all->CopyButDontOwnBlobs(); finder_->SplitAndInsertFragmentedTextPartition(fragment_me); finder_->ExpectPartition(TBOX(11, 5, 99, 15)); finder_->ExpectPartitionCount(1); } -} // namespace +} // namespace tesseract diff --git a/unittest/tablerecog_test.cc b/unittest/tablerecog_test.cc index 3dfb32c5..c4232b9c 100644 --- a/unittest/tablerecog_test.cc +++ b/unittest/tablerecog_test.cc @@ -20,7 +20,7 @@ namespace tesseract { class TestableTableRecognizer : public tesseract::TableRecognizer { - public: +public: using TableRecognizer::FindLinesBoundingBox; using TableRecognizer::HasSignificantLines; using TableRecognizer::RecognizeLinedTable; @@ -29,7 +29,7 @@ class TestableTableRecognizer : public tesseract::TableRecognizer { }; class TestableStructuredTable : public tesseract::StructuredTable { - public: +public: using StructuredTable::CountHorizontalIntersections; using StructuredTable::CountVerticalIntersections; using StructuredTable::FindLinedStructure; @@ -65,7 +65,7 @@ class TestableStructuredTable : public tesseract::StructuredTable { }; class SharedTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); ICOORD bleft(0, 0); @@ -89,8 +89,7 @@ class SharedTest : public testing::Test { void InsertPartition(int left, int bottom, int right, int top) { TBOX box(left, bottom, right, top); - ColPartition* part = - ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part->set_median_width(3); part->set_median_height(3); text_grid_->InsertBBox(true, true, part); @@ -100,30 +99,28 @@ class SharedTest : public testing::Test { } void InsertLines() { - line_box_.set_to_given_coords( - 100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(), - 450 + line_grid_->gridsize(), 50 + line_grid_->gridsize()); - for (int i = 10; i <= 
50; i += 10) InsertHorizontalLine(100, 450, i); - for (int i = 100; i <= 450; i += 50) InsertVerticalLine(i, 10, 50); + line_box_.set_to_given_coords(100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(), + 450 + line_grid_->gridsize(), 50 + line_grid_->gridsize()); + for (int i = 10; i <= 50; i += 10) + InsertHorizontalLine(100, 450, i); + for (int i = 100; i <= 450; i += 50) + InsertVerticalLine(i, 10, 50); - for (int i = 100; i <= 200; i += 20) InsertHorizontalLine(0, 100, i); + for (int i = 100; i <= 200; i += 20) + InsertHorizontalLine(0, 100, i); } void InsertHorizontalLine(int left, int right, int y) { - TBOX box(left, y - line_grid_->gridsize(), right, - y + line_grid_->gridsize()); - ColPartition* part = - ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE); + TBOX box(left, y - line_grid_->gridsize(), right, y + line_grid_->gridsize()); + ColPartition *part = ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE); line_grid_->InsertBBox(true, true, part); tesseract::ColPartition_IT add_it(&allocated_parts_); add_it.add_after_stay_put(part); } void InsertVerticalLine(int x, int bottom, int top) { - TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(), - top); - ColPartition* part = - ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE); + TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(), top); + ColPartition *part = ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE); line_grid_->InsertBBox(true, true, part); tesseract::ColPartition_IT add_it(&allocated_parts_); @@ -143,7 +140,7 @@ class SharedTest : public testing::Test { }; class TableRecognizerTest : public SharedTest { - protected: +protected: void SetUp() { SharedTest::SetUp(); recognizer_.reset(new TestableTableRecognizer()); @@ -156,7 +153,7 @@ class TableRecognizerTest : public SharedTest { }; class StructuredTableTest : public SharedTest { - protected: +protected: void SetUp() { 
SharedTest::SetUp(); table_.reset(new TestableStructuredTable()); @@ -266,8 +263,10 @@ TEST_F(StructuredTableTest, CountHorizontalIntersectionsAll) { } TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) { - for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); - for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + for (int y = 10; y <= 50; y += 10) + table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) + table_->InjectCellX(x); InsertLines(); InsertCellsInLines(); table_->set_bounding_box(line_box_); @@ -275,8 +274,10 @@ TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) { } TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) { - for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); - for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + for (int y = 10; y <= 50; y += 10) + table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) + table_->InjectCellX(x); InsertLines(); InsertCellsInLines(); InsertPartition(101, 11, 299, 19); @@ -285,8 +286,10 @@ TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) { } TEST_F(StructuredTableTest, VerifyLinedTableVerticalFail) { - for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); - for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + for (int y = 10; y <= 50; y += 10) + table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) + table_->InjectCellX(x); InsertLines(); InsertCellsInLines(); InsertPartition(151, 21, 199, 39); @@ -313,4 +316,4 @@ TEST_F(StructuredTableTest, FindWhitespacedColumnsSorted) { // TODO(nbeato): check failure cases // TODO(nbeato): check Recognize processes correctly on trivial real examples. 
-} // namespace +} // namespace tesseract diff --git a/unittest/tabvector_test.cc b/unittest/tabvector_test.cc index dab0ace8..90fe05ad 100644 --- a/unittest/tabvector_test.cc +++ b/unittest/tabvector_test.cc @@ -18,7 +18,7 @@ namespace tesseract { class TabVectorTest : public testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); vector_.reset(); @@ -58,7 +58,7 @@ TEST_F(TabVectorTest, XAtY45DegreeSlopeInRangeExact) { } TEST_F(TabVectorTest, XAtYVerticalInRangeExact) { - const int x = 120; // Arbitrary choice + const int x = 120; // Arbitrary choice MakeSimpleTabVector(x, 0, x, 100); for (int y = 0; y <= 100; ++y) { int result_x = vector_->XAtY(y); @@ -67,7 +67,7 @@ TEST_F(TabVectorTest, XAtYVerticalInRangeExact) { } TEST_F(TabVectorTest, XAtYHorizontal) { - const int y = 76; // arbitrary + const int y = 76; // arbitrary MakeSimpleTabVector(0, y, 100, y); EXPECT_EQ(0, vector_->XAtY(y)); // TODO(nbeato): What's the failure condition? @@ -91,13 +91,13 @@ TEST_F(TabVectorTest, XAtYLargeNumbers) { // Assume a document is 800 DPI, // the width of a page is 10 inches across (8000 pixels), and // the height of the page is 15 inches (12000 pixels). 
- MakeSimpleTabVector(7804, 504, 7968, 11768); // Arbitrary for vertical line - int x = vector_->XAtY(6136); // test mid point + MakeSimpleTabVector(7804, 504, 7968, 11768); // Arbitrary for vertical line + int x = vector_->XAtY(6136); // test mid point EXPECT_EQ(7886, x); } TEST_F(TabVectorTest, XAtYHorizontalInRangeExact) { - const int y = 120; // Arbitrary choice + const int y = 120; // Arbitrary choice MakeSimpleTabVector(50, y, 150, y); int x = vector_->XAtY(y); @@ -127,4 +127,4 @@ TEST_F(TabVectorTest, XYFlip) { EXPECT_EQ(3, vector_->endpt().y()); } -} // namespace +} // namespace tesseract diff --git a/unittest/tatweel_test.cc b/unittest/tatweel_test.cc index 2dbc2a26..a412eb9d 100644 --- a/unittest/tatweel_test.cc +++ b/unittest/tatweel_test.cc @@ -10,23 +10,23 @@ // limitations under the License. #if defined(_WIN32) -#include // for _access +# include // for _access #else -#include // for access +# include // for access #endif -#include "include_gunit.h" #include "dawg.h" +#include "include_gunit.h" #include "trie.h" #include "unicharset.h" #ifdef INCLUDE_TENSORFLOW -#include "util/utf8/unicodetext.h" // for UnicodeText +# include "util/utf8/unicodetext.h" // for UnicodeText #endif namespace tesseract { // Replacement for std::filesystem::exists (C++-17) -static bool file_exists(const char* filename) { +static bool file_exists(const char *filename) { #if defined(_WIN32) return _access(filename, 0) == 0; #else @@ -35,7 +35,7 @@ static bool file_exists(const char* filename) { } class TatweelTest : public ::testing::Test { - protected: +protected: void SetUp() override { static std::locale system_locale(""); std::locale::global(system_locale); @@ -53,7 +53,8 @@ class TatweelTest : public ::testing::Test { int num_tatweel = 0; for (auto it = text.begin(); it != text.end(); ++it) { std::string utf8 = it.get_utf8_string(); - if (utf8.find(u8"\u0640") != std::string::npos) ++num_tatweel; + if (utf8.find(u8"\u0640") != std::string::npos) + ++num_tatweel; 
unicharset_.unichar_insert(utf8.c_str()); } LOG(INFO) << "Num tatweels in source data=" << num_tatweel; @@ -62,7 +63,7 @@ class TatweelTest : public ::testing::Test { #endif } - std::string TestDataNameToPath(const std::string& name) { + std::string TestDataNameToPath(const std::string &name) { return file::JoinPath(TESTDATA_DIR, name); } UNICHARSET unicharset_; @@ -71,23 +72,21 @@ class TatweelTest : public ::testing::Test { TEST_F(TatweelTest, UnicharsetIgnoresTatweel) { // This test verifies that the unicharset ignores the Tatweel character. for (int i = 0; i < unicharset_.size(); ++i) { - const char* utf8 = unicharset_.id_to_unichar(i); - EXPECT_EQ(strstr(utf8, reinterpret_cast(u8"\u0640")), nullptr); + const char *utf8 = unicharset_.id_to_unichar(i); + EXPECT_EQ(strstr(utf8, reinterpret_cast(u8"\u0640")), nullptr); } } TEST_F(TatweelTest, DictIgnoresTatweel) { // This test verifies that the dictionary ignores the Tatweel character. - tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, - unicharset_.size(), 0); + tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0); std::string filename = TestDataNameToPath("ara.wordlist"); if (!file_exists(filename.c_str())) { LOG(INFO) << "Skip test because of missing " << filename; GTEST_SKIP(); } else { - EXPECT_TRUE(trie.read_and_add_word_list( - filename.c_str(), unicharset_, - tesseract::Trie::RRP_REVERSE_IF_HAS_RTL)); + EXPECT_TRUE(trie.read_and_add_word_list(filename.c_str(), unicharset_, + tesseract::Trie::RRP_REVERSE_IF_HAS_RTL)); EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false)); } } @@ -103,12 +102,13 @@ TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) { EXPECT_TRUE(unicharset_.load_from_file(filename.c_str())); int num_tatweel = 0; for (int i = 0; i < unicharset_.size(); ++i) { - const char* utf8 = unicharset_.id_to_unichar(i); - if (strstr(utf8, reinterpret_cast(u8"\u0640")) != nullptr) ++num_tatweel; + const char *utf8 = 
unicharset_.id_to_unichar(i); + if (strstr(utf8, reinterpret_cast(u8"\u0640")) != nullptr) + ++num_tatweel; } LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel; EXPECT_EQ(num_tatweel, 4); } } -} // namespace +} // namespace tesseract diff --git a/unittest/textlineprojection_test.cc b/unittest/textlineprojection_test.cc index bed47b54..3712d2c5 100644 --- a/unittest/textlineprojection_test.cc +++ b/unittest/textlineprojection_test.cc @@ -10,16 +10,16 @@ // limitations under the License. #include -#include // for std::string +#include // for std::string -#include "absl/strings/str_format.h" // for absl::StrFormat +#include "absl/strings/str_format.h" // for absl::StrFormat #include "include_gunit.h" #include -#include "colfind.h" -#include "log.h" // for LOG -#include "mutableiterator.h" #include +#include "colfind.h" +#include "log.h" // for LOG +#include "mutableiterator.h" #include "pageres.h" #include "tesseractclass.h" #include "textlineprojection.h" @@ -32,8 +32,8 @@ const int kMinStrongTextValue = 6; // The fixture for testing Tesseract. class TextlineProjectionTest : public testing::Test { - protected: - std::string OutputNameToPath(const std::string& name) { +protected: + std::string OutputNameToPath(const std::string &name) { file::MakeTmpdir(); return file::JoinPath(FLAGS_test_tmpdir, name); } @@ -53,7 +53,7 @@ class TextlineProjectionTest : public testing::Test { delete tesseract_; } - void SetImage(const char* filename) { + void SetImage(const char *filename) { pixDestroy(&src_pix_); src_pix_ = pixRead(file::JoinPath(TESTING_DIR, filename).c_str()); api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY); @@ -70,16 +70,14 @@ class TextlineProjectionTest : public testing::Test { // the resultiterator from a separate BaseAPI run. 
void SetupProjection() { tesseract::TessdataManager mgr; - Tesseract* osd_tess = new Tesseract; + Tesseract *osd_tess = new Tesseract; OSResults osr; - EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, "", "osd", - tesseract::OEM_TESSERACT_ONLY, nullptr, 0, - nullptr, nullptr, false, &mgr), + EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, "", "osd", tesseract::OEM_TESSERACT_ONLY, + nullptr, 0, nullptr, nullptr, false, &mgr), 0); tesseract_ = new Tesseract; - EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, "", "eng", - tesseract::OEM_TESSERACT_ONLY, nullptr, 0, - nullptr, nullptr, false, &mgr), + EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, "", "eng", tesseract::OEM_TESSERACT_ONLY, + nullptr, 0, nullptr, nullptr, false, &mgr), 0); bin_pix_ = api_.GetThresholdedImage(); *tesseract_->mutable_pix_binary() = pixClone(bin_pix_); @@ -88,26 +86,25 @@ class TextlineProjectionTest : public testing::Test { int width = pixGetWidth(bin_pix_); int height = pixGetHeight(bin_pix_); // First make a single block covering the whole image. - BLOCK* block = new BLOCK("", true, 0, 0, 0, 0, width, height); + BLOCK *block = new BLOCK("", true, 0, 0, 0, 0, width, height); block->set_right_to_left(false); BLOCK_LIST src_blocks; BLOCK_IT block_it(&src_blocks); block_it.add_to_end(block); - Pix* photomask_pix = nullptr; + Pix *photomask_pix = nullptr; // The blocks made by the ColumnFinder. Moved to blocks before return. 
BLOCK_LIST found_blocks; TO_BLOCK_LIST temp_blocks; - finder_ = tesseract_->SetupPageSegAndDetectOrientation( - tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess, &osr, &temp_blocks, - &photomask_pix, nullptr); + finder_ = + tesseract_->SetupPageSegAndDetectOrientation(tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess, + &osr, &temp_blocks, &photomask_pix, nullptr); TO_BLOCK_IT to_block_it(&temp_blocks); - TO_BLOCK* to_block = to_block_it.data(); + TO_BLOCK *to_block = to_block_it.data(); denorm_ = finder_->denorm(); TO_BLOCK_LIST to_blocks; BLOBNBOX_LIST diacritic_blobs; - EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block, - photomask_pix, nullptr, nullptr, nullptr, - &found_blocks, &diacritic_blobs, &to_blocks), + EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block, photomask_pix, nullptr, + nullptr, nullptr, &found_blocks, &diacritic_blobs, &to_blocks), 0); projection_ = finder_->projection(); pixDestroy(&photomask_pix); @@ -116,19 +113,17 @@ class TextlineProjectionTest : public testing::Test { // Helper evaluates the given box, expects the result to be greater_than // or !greater_than the target_value and provides diagnostics if not. - void EvaluateBox(const TBOX& box, bool greater_or_equal, int target_value, - const char* text, const char* message) { + void EvaluateBox(const TBOX &box, bool greater_or_equal, int target_value, const char *text, + const char *message) { int value = projection_->EvaluateBox(box, denorm_, false); if (greater_or_equal != (value > target_value)) { LOG(INFO) << absl::StrFormat( - "EvaluateBox too %s:%d vs %d for %s word '%s' at:", - greater_or_equal ? "low" : "high", value, target_value, message, - text); + "EvaluateBox too %s:%d vs %d for %s word '%s' at:", greater_or_equal ? 
"low" : "high", + value, target_value, message, text); box.print(); value = projection_->EvaluateBox(box, denorm_, true); } else { - LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'", - value, message, text); + LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'", value, message, text); } if (greater_or_equal) { EXPECT_GE(value, target_value); @@ -139,37 +134,33 @@ class TextlineProjectionTest : public testing::Test { // Helper evaluates the DistanceOfBoxFromBox function by expecting that // box should be nearer to true_box than false_box. - void EvaluateDistance(const TBOX& box, const TBOX& true_box, - const TBOX& false_box, const char* text, - const char* message) { - int true_dist = - projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false); - int false_dist = - projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false); + void EvaluateDistance(const TBOX &box, const TBOX &true_box, const TBOX &false_box, + const char *text, const char *message) { + int true_dist = projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false); + int false_dist = projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false); if (false_dist <= true_dist) { - LOG(INFO) << absl::StrFormat( - "Distance wrong:%d vs %d for %s word '%s' at:", - false_dist, true_dist, message, text); + LOG(INFO) << absl::StrFormat("Distance wrong:%d vs %d for %s word '%s' at:", false_dist, + true_dist, message, text); true_box.print(); projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, true); projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, true); } else { - LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'", - false_dist, true_dist, message, text); + LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'", false_dist, true_dist, + message, text); } } // Tests the projection on the word boxes of the given image. // line_height is the cap + descender size of the text. 
- void VerifyBoxes(const char* imagefile, int line_height) { + void VerifyBoxes(const char *imagefile, int line_height) { SetImage(imagefile); api_.Recognize(nullptr); SetupProjection(); - MutableIterator* it = api_.GetMutableIterator(); + MutableIterator *it = api_.GetMutableIterator(); do { - char* text = it->GetUTF8Text(tesseract::RIL_WORD); - const PAGE_RES_IT* pr_it = it->PageResIt(); - WERD_RES* word = pr_it->word(); + char *text = it->GetUTF8Text(tesseract::RIL_WORD); + const PAGE_RES_IT *pr_it = it->PageResIt(); + WERD_RES *word = pr_it->word(); // The word_box refers to the internal, possibly rotated, coords. TBOX word_box = word->word->bounding_box(); bool small_word = word_box.height() * 1.5 < line_height; @@ -196,7 +187,8 @@ class TextlineProjectionTest : public testing::Test { TBOX lower_box = word_box; lower_box.set_top(word_box.bottom()); lower_box.set_bottom(word_box.bottom() - padding); - if (tall_word) lower_box.move(ICOORD(0, padding / 2)); + if (tall_word) + lower_box.move(ICOORD(0, padding / 2)); EvaluateBox(lower_box, false, kMinStrongTextValue, text, "Lower Word"); EvaluateBox(lower_box, true, -1, text, "Lower Word not vertical"); @@ -225,38 +217,41 @@ class TextlineProjectionTest : public testing::Test { TBOX upper_challenger(upper_box); upper_challenger.set_bottom(upper_box.top()); upper_challenger.set_top(upper_box.top() + word_box.height()); - EvaluateDistance(upper_box, target_box, upper_challenger, text, - "Upper Word"); - if (tall_word) lower_box.move(ICOORD(0, padding / 2)); + EvaluateDistance(upper_box, target_box, upper_challenger, text, "Upper Word"); + if (tall_word) + lower_box.move(ICOORD(0, padding / 2)); lower_box.set_bottom(lower_box.top() - padding); target_box = word_box; target_box.set_bottom(lower_box.top()); TBOX lower_challenger(lower_box); lower_challenger.set_top(lower_box.bottom()); lower_challenger.set_bottom(lower_box.bottom() - word_box.height()); - EvaluateDistance(lower_box, target_box, lower_challenger, text, - 
"Lower Word"); + EvaluateDistance(lower_box, target_box, lower_challenger, text, "Lower Word"); delete[] text; } while (it->Next(tesseract::RIL_WORD)); delete it; } - Pix* src_pix_; - Pix* bin_pix_; + Pix *src_pix_; + Pix *bin_pix_; BLOCK_LIST blocks_; std::string ocr_text_; tesseract::TessBaseAPI api_; - Tesseract* tesseract_; - ColumnFinder* finder_; - const DENORM* denorm_; - const TextlineProjection* projection_; + Tesseract *tesseract_; + ColumnFinder *finder_; + const DENORM *denorm_; + const TextlineProjection *projection_; }; // Tests all word boxes on an unrotated image. -TEST_F(TextlineProjectionTest, Unrotated) { VerifyBoxes("phototest.tif", 31); } +TEST_F(TextlineProjectionTest, Unrotated) { + VerifyBoxes("phototest.tif", 31); +} // Tests character-level applyboxes on italic Times New Roman. -TEST_F(TextlineProjectionTest, Rotated) { VerifyBoxes("phototestrot.tif", 31); } +TEST_F(TextlineProjectionTest, Rotated) { + VerifyBoxes("phototestrot.tif", 31); +} -} // namespace +} // namespace tesseract diff --git a/unittest/tfile_test.cc b/unittest/tfile_test.cc index 166405ff..fcb27e06 100644 --- a/unittest/tfile_test.cc +++ b/unittest/tfile_test.cc @@ -20,7 +20,7 @@ namespace tesseract { // writing/reading. class TfileTest : public ::testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); } @@ -29,16 +29,18 @@ class TfileTest : public ::testing::Test { // Some data to serialize. class MathData { - public: + public: MathData() : num_squares_(0), num_triangles_(0) {} void Setup() { // Setup some data. 
- for (int s = 0; s < 42; ++s) squares_.push_back(s * s); + for (int s = 0; s < 42; ++s) + squares_.push_back(s * s); num_squares_ = squares_.size(); - for (int t = 0; t < 52; ++t) triangles_.push_back(t * (t + 1) / 2); + for (int t = 0; t < 52; ++t) + triangles_.push_back(t * (t + 1) / 2); num_triangles_ = triangles_.size(); } - void ExpectEq(const MathData& other) { + void ExpectEq(const MathData &other) { // Check the data. EXPECT_EQ(num_squares_, other.num_squares_); for (int s = 0; s < squares_.size(); ++s) @@ -47,32 +49,40 @@ class TfileTest : public ::testing::Test { for (int s = 0; s < triangles_.size(); ++s) EXPECT_EQ(triangles_[s], other.triangles_[s]); } - bool Serialize(TFile* fp) { - if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; - if (!squares_.Serialize(fp)) return false; + bool Serialize(TFile *fp) { + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; + if (!squares_.Serialize(fp)) + return false; if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) return false; - if (!triangles_.Serialize(fp)) return false; + if (!triangles_.Serialize(fp)) + return false; return true; } - bool DeSerialize(TFile* fp) { + bool DeSerialize(TFile *fp) { if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) return false; - if (!squares_.DeSerialize(fp)) return false; + if (!squares_.DeSerialize(fp)) + return false; if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) return false; - if (!triangles_.DeSerialize(fp)) return false; + if (!triangles_.DeSerialize(fp)) + return false; return true; } - bool SerializeBigEndian(TFile* fp) { + bool SerializeBigEndian(TFile *fp) { ReverseN(&num_squares_, sizeof(num_squares_)); - if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; // Write an additional reversed size before the vector, which will get // used as its size on reading. 
- if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; for (int i = 0; i < squares_.size(); ++i) ReverseN(&squares_[i], sizeof(squares_[i])); - if (!squares_.Serialize(fp)) return false; + if (!squares_.Serialize(fp)) + return false; ReverseN(&num_triangles_, sizeof(num_triangles_)); if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) return false; @@ -82,10 +92,11 @@ class TfileTest : public ::testing::Test { ReverseN(&triangles_[i], sizeof(triangles_[i])); return triangles_.Serialize(fp); } - bool DeSerializeBigEndian(TFile* fp) { + bool DeSerializeBigEndian(TFile *fp) { if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) return false; - if (!squares_.DeSerialize(fp)) return false; + if (!squares_.DeSerialize(fp)) + return false; // The first element is the size that was written, so we will delete it // and read the last element separately. int last_element; @@ -95,7 +106,8 @@ class TfileTest : public ::testing::Test { squares_.push_back(last_element); if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) return false; - if (!triangles_.DeSerialize(fp)) return false; + if (!triangles_.DeSerialize(fp)) + return false; if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) return false; triangles_.remove(0); @@ -103,7 +115,7 @@ class TfileTest : public ::testing::Test { return true; } - private: + private: GenericVector squares_; int num_squares_; GenericVector triangles_; @@ -176,4 +188,4 @@ TEST_F(TfileTest, BigEndian) { m3.ExpectEq(m2); } -} // namespace +} // namespace tesseract diff --git a/unittest/third_party/utf/rune.c b/unittest/third_party/utf/rune.c index af1bafbf..4b4f0697 100644 --- a/unittest/third_party/utf/rune.c +++ b/unittest/third_party/utf/rune.c @@ -16,32 +16,31 @@ #include "third_party/utf/utf.h" #include "third_party/utf/utfdef.h" -enum -{ - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - 
Bit4 = 3, - Bit5 = 2, +enum { + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ - T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ - Rune4 = (1<<(Bit4+3*Bitx))-1, - /* 0001 1111 1111 1111 1111 1111 */ + Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0111 1111 */ + Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0111 1111 1111 */ + Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 1111 1111 1111 1111 */ + Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1, + /* 0001 1111 1111 1111 1111 1111 */ - Maskx = (1< T1 - */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; - return 1; - } + /* + * one character sequence (7-bit value) + * 00000-0007F => T1 + */ + c = *(uchar *)str; + if (c < Tx) { + *rune = c; + return 1; + } - // If we can't read more than one character we must stop - if(length <= 1) { - goto badlen; - } + // If we can't read more than one character we must stop + if (length <= 1) { + goto badlen; + } - /* - * two character sequence (11-bit value) - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } + /* + * two character sequence (11-bit value) + * 0080-07FF => 
T2 Tx + */ + c1 = *(uchar *)(str + 1) ^ Tx; + if (c1 & Testx) + goto bad; + if (c < T3) { + if (c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if (l <= Rune1) + goto bad; + *rune = l; + return 2; + } - // If we can't read more than two characters we must stop - if(length <= 2) { - goto badlen; - } + // If we can't read more than two characters we must stop + if (length <= 2) { + goto badlen; + } - /* - * three character sequence (16-bit value) - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; - } + /* + * three character sequence (16-bit value) + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar *)(str + 2) ^ Tx; + if (c2 & Testx) + goto bad; + if (c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if (l <= Rune2) + goto bad; + *rune = l; + return 3; + } - if (length <= 3) - goto badlen; + if (length <= 3) + goto badlen; - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - c3 = *(uchar*)(str+3) ^ Tx; - if (c3 & Testx) - goto bad; - if (c < T5) { - l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if (l <= Rune3) - goto bad; - if (l > Runemax) - goto bad; - *rune = l; - return 4; - } + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar *)(str + 3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } - // Support for 5-byte or longer UTF-8 would go here, but - // since we don't have that, we'll just fall through to bad. + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just fall through to bad. 
- /* - * bad decoding - */ + /* + * bad decoding + */ bad: - *rune = Bad; - return 1; + *rune = Bad; + return 1; badlen: - *rune = Bad; - return 0; - + *rune = Bad; + return 0; } - /* * This is the older "unsafe" version, which works fine on * null-terminated strings. */ -int -chartorune(Rune *rune, const char *str) -{ - int c, c1, c2, c3; - long l; +int chartorune(Rune *rune, const char *str) { + int c, c1, c2, c3; + long l; - /* - * one character sequence - * 00000-0007F => T1 - */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; - return 1; - } + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(uchar *)str; + if (c < Tx) { + *rune = c; + return 1; + } - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(uchar *)(str + 1) ^ Tx; + if (c1 & Testx) + goto bad; + if (c < T3) { + if (c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if (l <= Rune1) + goto bad; + *rune = l; + return 2; + } - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; - } + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar *)(str + 2) ^ Tx; + if (c2 & Testx) + goto bad; + if (c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if (l <= Rune2) + goto bad; + *rune = l; + return 3; + } - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - c3 = *(uchar*)(str+3) ^ Tx; - if (c3 & Testx) - goto bad; - if (c < T5) { - l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if (l <= Rune3) - goto bad; - if (l > Runemax) - goto bad; - *rune = l; - 
return 4; - } + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar *)(str + 3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } - /* - * Support for 5-byte or longer UTF-8 would go here, but - * since we don't have that, we'll just fall through to bad. - */ + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ - /* - * bad decoding - */ + /* + * bad decoding + */ bad: - *rune = Bad; - return 1; + *rune = Bad; + return 1; } -int -isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { - *consumed = charntorune(rune, str, length); - return *rune != Runeerror || *consumed == 3; +int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed) { + *consumed = charntorune(rune, str, length); + return *rune != Runeerror || *consumed == 3; } -int -runetochar(char *str, const Rune *rune) -{ - /* Runes are signed, so convert to unsigned for range check. */ - unsigned long c; +int runetochar(char *str, const Rune *rune) { + /* Runes are signed, so convert to unsigned for range check. */ + unsigned long c; - /* - * one character sequence - * 00000-0007F => 00-7F - */ - c = *rune; - if(c <= Rune1) { - str[0] = c; - return 1; - } + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if (c <= Rune1) { + str[0] = c; + return 1; + } - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if (c <= Rune2) { + str[0] = T2 | (c >> 1 * Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } - /* - * If the Rune is out of range, convert it to the error rune. 
- * Do this test here because the error rune encodes to three bytes. - * Doing it earlier would duplicate work, since an out of range - * Rune wouldn't have fit in one or two bytes. - */ - if (c > Runemax) - c = Runeerror; + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; - } + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2 * Bitx); + str[1] = Tx | ((c >> 1 * Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - str[0] = T4 | (c >> 3*Bitx); - str[1] = Tx | ((c >> 2*Bitx) & Maskx); - str[2] = Tx | ((c >> 1*Bitx) & Maskx); - str[3] = Tx | (c & Maskx); - return 4; + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3 * Bitx); + str[1] = Tx | ((c >> 2 * Bitx) & Maskx); + str[2] = Tx | ((c >> 1 * Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } -int -runelen(Rune rune) -{ - char str[10]; +int runelen(Rune rune) { + char str[10]; - return runetochar(str, &rune); + return runetochar(str, &rune); } -int -runenlen(const Rune *r, int nrune) -{ - int nb; - ulong c; /* Rune is signed, so use unsigned for range check. */ +int runenlen(const Rune *r, int nrune) { + int nb; + ulong c; /* Rune is signed, so use unsigned for range check. 
*/ - nb = 0; - while(nrune--) { - c = *r++; - if (c <= Rune1) - nb++; - else if (c <= Rune2) - nb += 2; - else if (c <= Rune3) - nb += 3; - else if (c <= Runemax) - nb += 4; - else - nb += 3; /* Runeerror = 0xFFFD, see runetochar */ - } - return nb; + nb = 0; + while (nrune--) { + c = *r++; + if (c <= Rune1) + nb++; + else if (c <= Rune2) + nb += 2; + else if (c <= Rune3) + nb += 3; + else if (c <= Runemax) + nb += 4; + else + nb += 3; /* Runeerror = 0xFFFD, see runetochar */ + } + return nb; } -int -fullrune(const char *str, int n) -{ - if (n > 0) { - int c = *(uchar*)str; - if (c < Tx) - return 1; - if (n > 1) { - if (c < T3) - return 1; - if (n > 2) { - if (c < T4 || n > 3) - return 1; - } - } - } - return 0; +int fullrune(const char *str, int n) { + if (n > 0) { + int c = *(uchar *)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; } diff --git a/unittest/third_party/utf/utf.h b/unittest/third_party/utf/utf.h index 7d8cf547..bae7834f 100644 --- a/unittest/third_party/utf/utf.h +++ b/unittest/third_party/utf/utf.h @@ -16,18 +16,17 @@ #include -typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ -enum -{ - UTFmax = 4, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ - Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0xFFFD, /* decoding error in UTF */ - Runemax = 0x10FFFF, /* maximum rune value */ +enum { + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ }; -#ifdef __cplusplus +#ifdef __cplusplus extern "C" { #endif @@ -41,13 +40,12 @@ extern "C" { * SEE ALSO * utf 
(7) * tcs (1) -*/ + */ // runetochar copies (encodes) one rune, pointed to by r, to at most // UTFmax bytes starting at s and returns the number of bytes generated. -int runetochar(char* s, const Rune* r); - +int runetochar(char *s, const Rune *r); // chartorune copies (decodes) at most UTFmax bytes starting at s to // one rune, pointed to by r, and returns the number of bytes consumed. @@ -59,8 +57,7 @@ int runetochar(char* s, const Rune* r); // Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal // anywhere else in a UTF sequence. -int chartorune(Rune* r, const char* s); - +int chartorune(Rune *r, const char *s); // charntorune is like chartorune, except that it will access at most // n bytes of s. If the UTF sequence is incomplete within n bytes, @@ -69,25 +66,23 @@ int chartorune(Rune* r, const char* s); // // Added 2004-09-24 by Wei-Hwa Huang -int charntorune(Rune* r, const char* s, int n); +int charntorune(Rune *r, const char *s, int n); // isvalidcharntorune(str, n, r, consumed) // is a convenience function that calls "*consumed = charntorune(r, str, n)" // and returns an int (logically boolean) indicating whether the first // n bytes of str was a valid and complete UTF sequence. -int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); +int isvalidcharntorune(const char *str, int n, Rune *r, int *consumed); // runelen returns the number of bytes required to convert r into UTF. int runelen(Rune r); - // runenlen returns the number of bytes required to convert the n // runes pointed to by r into UTF. -int runenlen(const Rune* r, int n); - +int runenlen(const Rune *r, int n); // fullrune returns 1 if the string s of length n is long enough to be // decoded by chartorune, and 0 otherwise. This does not guarantee @@ -95,7 +90,7 @@ int runenlen(const Rune* r, int n); // by programs that obtain input one byte at a time and need to know // when a full rune has arrived. 
-int fullrune(const char* s, int n); +int fullrune(const char *s, int n); // The following routines are analogous to the corresponding string // routines with "utf" substituted for "str", and "rune" substituted @@ -104,8 +99,7 @@ int fullrune(const char* s, int n); // utflen returns the number of runes that are represented by the UTF // string s. (cf. strlen) -int utflen(const char* s); - +int utflen(const char *s); // utfnlen returns the number of complete runes that are represented // by the first n bytes of the UTF string s. If the last few bytes of @@ -113,40 +107,34 @@ int utflen(const char* s); // count them; in this way, it differs from utflen, which includes // every byte of the string. (cf. strnlen) -int utfnlen(const char* s, long n); - +int utfnlen(const char *s, long n); // utfrune returns a pointer to the first occurrence of rune r in the // UTF string s, or 0 if r does not occur in the string. The NULL // byte terminating a string is considered to be part of the string s. // (cf. strchr) -const char* utfrune(const char* s, Rune r); - +const char *utfrune(const char *s, Rune r); // utfrrune returns a pointer to the last occurrence of rune r in the // UTF string s, or 0 if r does not occur in the string. The NULL // byte terminating a string is considered to be part of the string s. // (cf. strrchr) -const char* utfrrune(const char* s, Rune r); - +const char *utfrrune(const char *s, Rune r); // utfutf returns a pointer to the first occurrence of the UTF string // s2 as a UTF substring of s1, or 0 if there is none. If s2 is the // null string, utfutf returns s1. (cf. strstr) -const char* utfutf(const char* s1, const char* s2); - +const char *utfutf(const char *s1, const char *s2); // utfecpy copies UTF sequences until a null sequence has been copied, // but writes no sequences beyond es1. If any sequences are copied, // s1 is terminated by a null sequence, and a pointer to that sequence // is returned. Otherwise, the original s1 is returned. (cf. 
strecpy) -char* utfecpy(char *s1, char *es1, const char *s2); - - +char *utfecpy(char *s1, char *es1, const char *s2); // These functions are rune-string analogues of the corresponding // functions in strcat (3). @@ -159,25 +147,23 @@ char* utfecpy(char *s1, char *es1, const char *s2); // // BUGS: The outcome of overlapping moves varies among implementations. -Rune* runestrcat(Rune* s1, const Rune* s2); -Rune* runestrncat(Rune* s1, const Rune* s2, long n); +Rune *runestrcat(Rune *s1, const Rune *s2); +Rune *runestrncat(Rune *s1, const Rune *s2, long n); -const Rune* runestrchr(const Rune* s, Rune c); +const Rune *runestrchr(const Rune *s, Rune c); -int runestrcmp(const Rune* s1, const Rune* s2); -int runestrncmp(const Rune* s1, const Rune* s2, long n); +int runestrcmp(const Rune *s1, const Rune *s2); +int runestrncmp(const Rune *s1, const Rune *s2, long n); -Rune* runestrcpy(Rune* s1, const Rune* s2); -Rune* runestrncpy(Rune* s1, const Rune* s2, long n); -Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); - -Rune* runestrdup(const Rune* s); - -const Rune* runestrrchr(const Rune* s, Rune c); -long runestrlen(const Rune* s); -const Rune* runestrstr(const Rune* s1, const Rune* s2); +Rune *runestrcpy(Rune *s1, const Rune *s2); +Rune *runestrncpy(Rune *s1, const Rune *s2, long n); +Rune *runestrecpy(Rune *s1, Rune *es1, const Rune *s2); +Rune *runestrdup(const Rune *s); +const Rune *runestrrchr(const Rune *s, Rune c); +long runestrlen(const Rune *s); +const Rune *runestrstr(const Rune *s1, const Rune *s2); // The following routines test types and modify cases for Unicode // characters. Unicode defines some characters as letters and @@ -200,7 +186,6 @@ Rune toupperrune(Rune r); Rune tolowerrune(Rune r); Rune totitlerune(Rune r); - // isupperrune tests for upper case characters, including Unicode // upper case letters and targets of the toupper mapping. islowerrune // and istitlerune are defined analogously. 
@@ -209,37 +194,32 @@ int isupperrune(Rune r); int islowerrune(Rune r); int istitlerune(Rune r); - // isalpharune tests for Unicode letters; this includes ideographs in // addition to alphabetic characters. int isalpharune(Rune r); - // isdigitrune tests for digits. Non-digit numbers, such as Roman // numerals, are not included. int isdigitrune(Rune r); - // isideographicrune tests for ideographic characters and numbers, as // defined by the Unicode standard. int isideographicrune(Rune r); - // isspacerune tests for whitespace characters, including "C" locale // whitespace, Unicode defined whitespace, and the "zero-width // non-break space" character. int isspacerune(Rune r); - // (The comments in this file were copied from the manpage files rune.3, // isalpharune.3, and runestrcat.3. Some formatting changes were also made // to conform to Google style. /JRM 11/11/05) -#ifdef __cplusplus +#ifdef __cplusplus } #endif diff --git a/unittest/third_party/utf/utfdef.h b/unittest/third_party/utf/utfdef.h index 4b58ae87..deaf396b 100644 --- a/unittest/third_party/utf/utfdef.h +++ b/unittest/third_party/utf/utfdef.h @@ -5,10 +5,10 @@ #define vlong _utfvlong #define uvlong _utfuvlong -typedef unsigned char uchar; -typedef unsigned short ushort; -typedef unsigned int uint; -typedef unsigned long ulong; +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; -#define nelem(x) (sizeof(x)/sizeof((x)[0])) -#define nil ((void*)0) +#define nelem(x) (sizeof(x) / sizeof((x)[0])) +#define nil ((void *)0) diff --git a/unittest/unichar_test.cc b/unittest/unichar_test.cc index 54394436..e03dad17 100644 --- a/unittest/unichar_test.cc +++ b/unittest/unichar_test.cc @@ -9,9 +9,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "include_gunit.h" -#include "gmock/gmock.h" // for testing::ElementsAreArray #include +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "include_gunit.h" namespace tesseract { @@ -19,7 +19,7 @@ TEST(UnicharTest, Conversion) { // This test verifies that Unichar::UTF8ToUTF32 and Unichar::UTF32ToUTF8 // show the required conversion properties. // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes. - const char* kUTF8Src = "a\u05d0\u0ca4\U0002a714"; + const char *kUTF8Src = "a\u05d0\u0ca4\U0002a714"; const std::vector kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714}; // Check for round-trip conversion. std::vector utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src); @@ -30,7 +30,7 @@ TEST(UnicharTest, Conversion) { TEST(UnicharTest, InvalidText) { // This test verifies that Unichar correctly deals with invalid text. - const char* kInvalidUTF8 = "a b\200d string"; + const char *kInvalidUTF8 = "a b\200d string"; const std::vector kInvalidUTF32 = {'a', ' ', 0x200000, 'x'}; // Invalid utf8 produces an empty vector. std::vector utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8); @@ -40,4 +40,4 @@ TEST(UnicharTest, InvalidText) { EXPECT_TRUE(utf8.empty()); } -} // namespace +} // namespace tesseract diff --git a/unittest/unicharcompress_test.cc b/unittest/unicharcompress_test.cc index 9c7662ae..ca731b80 100644 --- a/unittest/unicharcompress_test.cc +++ b/unittest/unicharcompress_test.cc @@ -11,13 +11,13 @@ #include +#include #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_split.h" -#include #include "include_gunit.h" -#include "log.h" // for LOG +#include "log.h" // for LOG #include "serialis.h" #include "tprintf.h" #include "unicharcompress.h" @@ -25,32 +25,28 @@ namespace tesseract { class UnicharcompressTest : public ::testing::Test { - protected: +protected: void SetUp() { std::locale::global(std::locale("")); file::MakeTmpdir(); } // Loads and compresses the given unicharset. 
- void LoadUnicharset(const std::string& unicharset_name) { - std::string radical_stroke_file = - file::JoinPath(LANGDATA_DIR, "radical-stroke.txt"); - std::string unicharset_file = - file::JoinPath(TESTDATA_DIR, unicharset_name); + void LoadUnicharset(const std::string &unicharset_name) { + std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt"); + std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name); std::string radical_data; - CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, - file::Defaults())); + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults())); CHECK(unicharset_.load_from_file(unicharset_file.c_str())); STRING radical_str(radical_data.c_str()); - null_char_ = - unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size(); + null_char_ = unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size(); compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str); // Get the encoding of the null char. RecodedCharID code; compressed_.EncodeUnichar(null_char_, &code); encoded_null_char_ = code(0); - std::string output_name = file::JoinPath( - FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt")); + std::string output_name = + file::JoinPath(FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt")); STRING encoding = compressed_.GetEncodingAsString(unicharset_); std::string encoding_str(&encoding[0], encoding.size()); CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults())); @@ -67,32 +63,31 @@ class UnicharcompressTest : public ::testing::Test { EXPECT_TRUE(compressed_.DeSerialize(&rfp)); } // Returns true if the lang is in CJK. 
- bool IsCJKLang(const std::string& lang) { - return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || - lang == "jpn"; + bool IsCJKLang(const std::string &lang) { + return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || lang == "jpn"; } // Returns true if the lang is Indic. - bool IsIndicLang(const std::string& lang) { - return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || - lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" || - lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" || - lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" || - lang == "tel"; + bool IsIndicLang(const std::string &lang) { + return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || lang == "mar" || + lang == "nep" || lang == "san" || lang == "bod" || lang == "dzo" || lang == "guj" || + lang == "kan" || lang == "mal" || lang == "ori" || lang == "pan" || lang == "sin" || + lang == "tam" || lang == "tel"; } // Expects the appropriate results from the compressed_ unicharset_. - void ExpectCorrect(const std::string& lang) { + void ExpectCorrect(const std::string &lang) { // Count the number of times each code is used in each element of // RecodedCharID. RecodedCharID zeros; - for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0); + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) + zeros.Set(i, 0); int code_range = compressed_.code_range(); - std::vector times_seen(code_range, zeros); + std::vector times_seen(code_range, zeros); for (int u = 0; u <= unicharset_.size(); ++u) { if (u != UNICHAR_SPACE && u != null_char_ && - (u == unicharset_.size() || (unicharset_.has_special_codes() && - u < SPECIAL_UNICHAR_CODES_COUNT))) { - continue; // Not used so not encoded. + (u == unicharset_.size() || + (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) { + continue; // Not used so not encoded. 
} RecodedCharID code; int len = compressed_.EncodeUnichar(u, &code); @@ -117,7 +112,8 @@ class UnicharcompressTest : public ::testing::Test { for (int c = 0; c < code_range; ++c) { int num_used = 0; for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) { - if (times_seen[c](i) != 0) ++num_used; + if (times_seen[c](i) != 0) + ++num_used; } EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range; } @@ -133,16 +129,15 @@ class UnicharcompressTest : public ::testing::Test { } else { EXPECT_LE(code_range, unicharset_.size() + 1); } - LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " - << code_range; + LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " << code_range; } // Checks for extensions of the current code that either finish a code, or // extend it and checks those extensions recursively. - void CheckCodeExtensions(const RecodedCharID& code, - const std::vector& times_seen) { + void CheckCodeExtensions(const RecodedCharID &code, + const std::vector ×_seen) { RecodedCharID extended = code; int length = code.length(); - const GenericVector* final_codes = compressed_.GetFinalCodes(code); + const GenericVector *final_codes = compressed_.GetFinalCodes(code); if (final_codes != nullptr) { for (int i = 0; i < final_codes->size(); ++i) { int ending = (*final_codes)[i]; @@ -152,7 +147,7 @@ class UnicharcompressTest : public ::testing::Test { EXPECT_NE(INVALID_UNICHAR_ID, unichar_id); } } - const GenericVector* next_codes = compressed_.GetNextCodes(code); + const GenericVector *next_codes = compressed_.GetNextCodes(code); if (next_codes != nullptr) { for (int i = 0; i < next_codes->size(); ++i) { int extension = (*next_codes)[i]; @@ -238,8 +233,7 @@ TEST_F(UnicharcompressTest, GetEncodingAsString) { ExpectCorrect("trivial"); STRING encoding = compressed_.GetEncodingAsString(unicharset_); std::string encoding_str(&encoding[0], encoding.length()); - std::vector lines = - absl::StrSplit(encoding_str, "\n", absl::SkipEmpty()); + 
std::vector lines = absl::StrSplit(encoding_str, "\n", absl::SkipEmpty()); EXPECT_EQ(5, lines.size()); // The first line is always space. EXPECT_EQ("0\t ", lines[0]); @@ -254,4 +248,4 @@ TEST_F(UnicharcompressTest, GetEncodingAsString) { EXPECT_EQ("3\t", lines[4]); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/unicharset_test.cc b/unittest/unicharset_test.cc index 401a34c1..2b8a77a9 100644 --- a/unittest/unicharset_test.cc +++ b/unittest/unicharset_test.cc @@ -9,18 +9,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "log.h" // for LOG #include "unicharset.h" -#include "gmock/gmock.h" // for testing::ElementsAreArray +#include +#include "gmock/gmock.h" // for testing::ElementsAreArray #include "include_gunit.h" +#include "log.h" // for LOG using testing::ElementsAreArray; namespace tesseract { class UnicharsetTest : public ::testing::Test { - protected: +protected: void SetUp() override { std::locale::global(std::locale("")); } @@ -55,11 +55,9 @@ TEST(UnicharsetTest, Basics) { EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6})); // With the fi ligature encoding fails without a pre-cleanup. std::string lig_str = "af\ufb01ne"; - EXPECT_FALSE( - u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); + EXPECT_FALSE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); lig_str = u.CleanupString(lig_str.c_str()); - EXPECT_TRUE( - u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); + EXPECT_TRUE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); v = std::vector(&labels[0], &labels[0] + labels.size()); EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6})); } @@ -76,7 +74,7 @@ TEST(UnicharsetTest, Multibyte) { EXPECT_EQ(u.size(), 5); u.unichar_insert("\u062f"); EXPECT_EQ(u.size(), 6); - u.unichar_insert("\ufb01"); // fi ligature is added as fi pair. 
+ u.unichar_insert("\ufb01"); // fi ligature is added as fi pair. EXPECT_EQ(u.size(), 7); u.unichar_insert("\u062b"); EXPECT_EQ(u.size(), 8); @@ -94,8 +92,8 @@ TEST(UnicharsetTest, Multibyte) { // The fi ligature is findable. EXPECT_EQ(u.unichar_to_id("\ufb01"), 6); std::vector labels; - EXPECT_TRUE(u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, - &labels, nullptr, nullptr)); + EXPECT_TRUE( + u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, &labels, nullptr, nullptr)); std::vector v(&labels[0], &labels[0] + labels.size()); EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7})); // With the fi ligature the fi is picked out. @@ -104,8 +102,7 @@ TEST(UnicharsetTest, Multibyte) { std::string src_str = "\u0627\u062c\ufb01\u0635\u062b"; // src_str has to be pre-cleaned for lengths to be correct. std::string cleaned = u.CleanupString(src_str.c_str()); - EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, - &encoded_length)); + EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, &encoded_length)); EXPECT_EQ(encoded_length, cleaned.size()); std::string len_str(&lengths[0], lengths.size()); EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002"); @@ -150,12 +147,11 @@ TEST(UnicharsetTest, MultibyteBigrams) { TEST(UnicharsetTest, OldStyle) { // This test verifies an old unicharset that contains fi/fl ligatures loads // and keeps all the entries. 
- std::string filename = - file::JoinPath(TESTDATA_DIR, "eng.unicharset"); + std::string filename = file::JoinPath(TESTDATA_DIR, "eng.unicharset"); UNICHARSET u; LOG(INFO) << "Filename=" << filename; EXPECT_TRUE(u.load_from_file(filename.c_str())); EXPECT_EQ(u.size(), 111); } -} // namespace +} // namespace tesseract diff --git a/unittest/util/utf8/unicodetext.cc b/unittest/util/utf8/unicodetext.cc index 1a884dd1..d1174d1a 100644 --- a/unittest/util/utf8/unicodetext.cc +++ b/unittest/util/utf8/unicodetext.cc @@ -16,39 +16,38 @@ #include "util/utf8/unicodetext.h" -#include // for memcpy, NULL, memcmp, etc -#include // for max +#include // for memcpy, NULL, memcmp, etc +#include // for max //#include "base/logging.h" // for operator<<, CHECK, etc //#include "base/stringprintf.h" // for StringPrintf, StringAppendF //#include "strings/stringpiece.h" // for StringPiece, etc -#include "third_party/utf/utf.h" // for isvalidcharntorune, etc -#include "util/utf8/unilib.h" // for IsInterchangeValid, etc -#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen +#include "third_party/utf/utf.h" // for isvalidcharntorune, etc +#include "util/utf8/unilib.h" // for IsInterchangeValid, etc +#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen -static int CodepointDistance(const char* start, const char* end) { +static int CodepointDistance(const char *start, const char *end) { int n = 0; // Increment n on every non-trail-byte. 
- for (const char* p = start; p < end; ++p) { - n += (*reinterpret_cast(p) >= -0x40); + for (const char *p = start; p < end; ++p) { + n += (*reinterpret_cast(p) >= -0x40); } return n; } -static int CodepointCount(const char* utf8, int len) { +static int CodepointCount(const char *utf8, int len) { return CodepointDistance(utf8, utf8 + len); } -UnicodeText::const_iterator::difference_type -distance(const UnicodeText::const_iterator& first, - const UnicodeText::const_iterator& last) { +UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, + const UnicodeText::const_iterator &last) { return CodepointDistance(first.it_, last.it_); } // ---------- Utility ---------- -static int ConvertToInterchangeValid(char* start, int len) { +static int ConvertToInterchangeValid(char *start, int len) { // This routine is called only when we've discovered that a UTF-8 buffer // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 // was not interchange valid. This indicates a bug in the caller, and @@ -63,9 +62,9 @@ static int ConvertToInterchangeValid(char* start, int len) { // Since the conversion never needs to write more data than it // reads, it is safe to change the buffer in place. It returns the // number of bytes written. - char* const in = start; - char* out = start; - char* const end = start + len; + char *const in = start; + char *out = start; + char *const end = start + len; while (start < end) { int good = UniLib::SpanInterchangeValid(start, end - start); if (good > 0) { @@ -83,16 +82,15 @@ static int ConvertToInterchangeValid(char* start, int len) { int n; if (isvalidcharntorune(start, end - start, &rune, &n)) { // structurally valid UTF8, but not interchange valid - start += n; // Skip over the whole character. - } else { // bad UTF8 - start += 1; // Skip over just one byte + start += n; // Skip over the whole character. 
+ } else { // bad UTF8 + start += 1; // Skip over just one byte } *out++ = ' '; } return out - in; } - // *************** Data representation ********** // Note: the copy constructor is undefined. @@ -101,19 +99,21 @@ static int ConvertToInterchangeValid(char* start, int len) { void UnicodeText::Repr::reserve(int new_capacity) { // If there's already enough capacity, and we're an owner, do nothing. - if (capacity_ >= new_capacity && ours_) return; + if (capacity_ >= new_capacity && ours_) + return; // Otherwise, allocate a new buffer. capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20); - char* new_data = new char[capacity_]; + char *new_data = new char[capacity_]; // If there is an old buffer, copy it into the new buffer. if (data_) { memcpy(new_data, data_, size_); - if (ours_) delete[] data_; // If we owned the old buffer, free it. + if (ours_) + delete[] data_; // If we owned the old buffer, free it. } data_ = new_data; - ours_ = true; // We own the new buffer. + ours_ = true; // We own the new buffer. // size_ is unchanged. } @@ -121,9 +121,11 @@ void UnicodeText::Repr::resize(int new_size) { if (new_size == 0) { clear(); } else { - if (!ours_ || new_size > capacity_) reserve(new_size); + if (!ours_ || new_size > capacity_) + reserve(new_size); // Clear the memory in the expanded part. - if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); + if (size_ < new_size) + memset(data_ + size_, 0, new_size - size_); size_ = new_size; ours_ = true; } @@ -132,120 +134,113 @@ void UnicodeText::Repr::resize(int new_size) { // This implementation of clear() deallocates the buffer if we're an owner. // That's not strictly necessary; we could just set size_ to 0. 
void UnicodeText::Repr::clear() { - if (ours_) delete[] data_; + if (ours_) + delete[] data_; data_ = nullptr; size_ = capacity_ = 0; ours_ = true; } -void UnicodeText::Repr::Copy(const char* data, int size) { +void UnicodeText::Repr::Copy(const char *data, int size) { resize(size); memcpy(data_, data, size); } -void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { - if (data == data_) return; // We already own this memory. (Weird case.) - if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. +void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) { + if (data == data_) + return; // We already own this memory. (Weird case.) + if (ours_ && data_) + delete[] data_; // If we owned the old buffer, free it. data_ = data; size_ = size; capacity_ = capacity; ours_ = true; } -void UnicodeText::Repr::PointTo(const char* data, int size) { - if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. - data_ = const_cast(data); +void UnicodeText::Repr::PointTo(const char *data, int size) { + if (ours_ && data_) + delete[] data_; // If we owned the old buffer, free it. + data_ = const_cast(data); size_ = size; capacity_ = size; ours_ = false; } -void UnicodeText::Repr::append(const char* bytes, int byte_length) { +void UnicodeText::Repr::append(const char *bytes, int byte_length) { reserve(size_ + byte_length); memcpy(data_ + size_, bytes, byte_length); size_ += byte_length; } string UnicodeText::Repr::DebugString() const { - return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", - this, - data_, size_, capacity_, - ours_ ? "Owned" : "Alias"); + return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_, + capacity_, ours_ ? 
"Owned" : "Alias"); } - - // *************** UnicodeText ****************** // ----- Constructors ----- // Default constructor -UnicodeText::UnicodeText() { -} +UnicodeText::UnicodeText() {} // Copy constructor -UnicodeText::UnicodeText(const UnicodeText& src) { +UnicodeText::UnicodeText(const UnicodeText &src) { Copy(src); } // Substring constructor -UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, - const UnicodeText::const_iterator& last) { +UnicodeText::UnicodeText(const UnicodeText::const_iterator &first, + const UnicodeText::const_iterator &last) { CHECK(first <= last) << " Incompatible iterators"; repr_.append(first.it_, last.it_ - first.it_); } -string UnicodeText::UTF8Substring(const const_iterator& first, - const const_iterator& last) { +string UnicodeText::UTF8Substring(const const_iterator &first, const const_iterator &last) { CHECK(first <= last) << " Incompatible iterators"; return string(first.it_, last.it_ - first.it_); } - // ----- Copy ----- -UnicodeText& UnicodeText::operator=(const UnicodeText& src) { +UnicodeText &UnicodeText::operator=(const UnicodeText &src) { if (this != &src) { Copy(src); } return *this; } -UnicodeText& UnicodeText::Copy(const UnicodeText& src) { +UnicodeText &UnicodeText::Copy(const UnicodeText &src) { repr_.Copy(src.repr_.data_, src.repr_.size_); return *this; } -UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { +UnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) { repr_.Copy(buffer, byte_length); - if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + if (!UniLib::IsInterchangeValid(buffer, byte_length)) { LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); } return *this; } -UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, - int byte_length) { +UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) { repr_.Copy(buffer, byte_length); return *this; 
} // ----- TakeOwnershipOf ----- -UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, - int byte_length, - int byte_capacity) { +UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) { repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); - if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + if (!UniLib::IsInterchangeValid(buffer, byte_length)) { LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); } return *this; } -UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, - int byte_length, +UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) { repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); return *this; @@ -253,8 +248,8 @@ UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, // ----- PointTo ----- -UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { - if (UniLib:: IsInterchangeValid(buffer, byte_length)) { +UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) { + if (UniLib::IsInterchangeValid(buffer, byte_length)) { repr_.PointTo(buffer, byte_length); } else { LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; @@ -264,19 +259,17 @@ UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { return *this; } -UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, - int byte_length) { +UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) { repr_.PointTo(buffer, byte_length); return *this; } -UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { +UnicodeText &UnicodeText::PointTo(const UnicodeText &src) { repr_.PointTo(src.repr_.data_, src.repr_.size_); return *this; } -UnicodeText& UnicodeText::PointTo(const const_iterator &first, - const const_iterator &last) { +UnicodeText &UnicodeText::PointTo(const const_iterator &first, const 
const_iterator &last) { CHECK(first <= last) << " Incompatible iterators"; repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); return *this; @@ -284,47 +277,47 @@ UnicodeText& UnicodeText::PointTo(const const_iterator &first, // ----- Append ----- -UnicodeText& UnicodeText::append(const UnicodeText& u) { +UnicodeText &UnicodeText::append(const UnicodeText &u) { repr_.append(u.repr_.data_, u.repr_.size_); return *this; } -UnicodeText& UnicodeText::append(const const_iterator& first, - const const_iterator& last) { +UnicodeText &UnicodeText::append(const const_iterator &first, const const_iterator &last) { CHECK(first <= last) << " Incompatible iterators"; repr_.append(first.it_, last.it_ - first.it_); return *this; } -UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { +UnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) { repr_.append(utf8, len); return *this; } // ----- substring searching ----- -UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, +UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look, const_iterator start_pos) const { CHECK_GE(start_pos.utf8_data(), utf8_data()); CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length()); return UnsafeFind(look, start_pos); } -UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { +UnicodeText::const_iterator UnicodeText::find(const UnicodeText &look) const { return UnsafeFind(look, begin()); } -UnicodeText::const_iterator UnicodeText::UnsafeFind( - const UnicodeText& look, const_iterator start_pos) const { +UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look, + const_iterator start_pos) const { // Due to the magic of the UTF8 encoding, searching for a sequence of // letters is equivalent to substring search. 
StringPiece searching(utf8_data(), utf8_length()); StringPiece look_piece(look.utf8_data(), look.utf8_length()); LOG(FATAL) << "Not implemented"; - //StringPiece::size_type found = + // StringPiece::size_type found = // searching.find(look_piece, start_pos.utf8_data() - utf8_data()); StringPiece::size_type found = StringPiece::npos; - if (found == StringPiece::npos) return end(); + if (found == StringPiece::npos) + return end(); return const_iterator(utf8_data() + found); } @@ -336,7 +329,7 @@ bool UnicodeText::HasReplacementChar() const { StringPiece searching(utf8_data(), utf8_length()); StringPiece looking_for("\xEF\xBF\xBD", 3); LOG(FATAL) << "Not implemented"; - //return searching.find(looking_for) != StringPiece::npos; + // return searching.find(looking_for) != StringPiece::npos; return false; } @@ -350,7 +343,6 @@ void UnicodeText::clear() { // Destructor UnicodeText::~UnicodeText() {} - void UnicodeText::push_back(char32 c) { if (UniLib::IsValidCodepoint(c)) { char buf[UTFmax]; @@ -358,8 +350,7 @@ void UnicodeText::push_back(char32 c) { if (UniLib::IsInterchangeValid(buf, len)) { repr_.append(buf, len); } else { - LOG(WARNING) << "Unicode value 0x" << std::hex << c - << " is not valid for interchange"; + LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange"; repr_.append(" ", 1); } } else { @@ -372,20 +363,19 @@ int UnicodeText::size() const { return CodepointCount(repr_.data_, repr_.size_); } -bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { - if (&lhs == &rhs) return true; - if (lhs.repr_.size_ != rhs.repr_.size_) return false; +bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) { + if (&lhs == &rhs) + return true; + if (lhs.repr_.size_ != rhs.repr_.size_) + return false; return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; } string UnicodeText::DebugString() const { - return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", - this, - size(), - 
repr_.DebugString().c_str()); + return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(), + repr_.DebugString().c_str()); } - // ******************* UnicodeText::const_iterator ********************* // The implementation of const_iterator would be nicer if it @@ -394,12 +384,9 @@ string UnicodeText::DebugString() const { UnicodeText::const_iterator::const_iterator() : it_(nullptr) {} -UnicodeText::const_iterator::const_iterator(const const_iterator& other) - : it_(other.it_) { -} +UnicodeText::const_iterator::const_iterator(const const_iterator &other) : it_(other.it_) {} -UnicodeText::const_iterator& -UnicodeText::const_iterator::operator=(const const_iterator& other) { +UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(const const_iterator &other) { if (&other != this) it_ = other.it_; return *this; @@ -413,8 +400,7 @@ UnicodeText::const_iterator UnicodeText::end() const { return const_iterator(repr_.data_ + repr_.size_); } -bool operator<(const UnicodeText::const_iterator& lhs, - const UnicodeText::const_iterator& rhs) { +bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs) { return lhs.it_ < rhs.it_; } @@ -431,36 +417,37 @@ char32 UnicodeText::const_iterator::operator*() const { unsigned char byte2 = it_[1]; if (byte1 < 0xE0) - return ((byte1 & 0x1F) << 6) - | (byte2 & 0x3F); + return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F); unsigned char byte3 = it_[2]; if (byte1 < 0xF0) - return ((byte1 & 0x0F) << 12) - | ((byte2 & 0x3F) << 6) - | (byte3 & 0x3F); + return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F); unsigned char byte4 = it_[3]; - return ((byte1 & 0x07) << 18) - | ((byte2 & 0x3F) << 12) - | ((byte3 & 0x3F) << 6) - | (byte4 & 0x3F); + return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F); } -UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { +UnicodeText::const_iterator 
&UnicodeText::const_iterator::operator++() { it_ += UniLib::OneCharLen(it_); return *this; } -UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { - while (UniLib::IsTrailByte(*--it_)); +UnicodeText::const_iterator &UnicodeText::const_iterator::operator--() { + while (UniLib::IsTrailByte(*--it_)) + ; return *this; } -int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { - utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1; - utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2; - utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3; +int UnicodeText::const_iterator::get_utf8(char *utf8_output) const { + utf8_output[0] = it_[0]; + if ((it_[0] & 0xff) < 0x80) + return 1; + utf8_output[1] = it_[1]; + if ((it_[0] & 0xff) < 0xE0) + return 2; + utf8_output[2] = it_[2]; + if ((it_[0] & 0xff) < 0xF0) + return 3; utf8_output[3] = it_[3]; return 4; } @@ -481,11 +468,11 @@ int UnicodeText::const_iterator::utf8_length() const { } } -UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { +UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const { CHECK(p != nullptr); - const char* start = utf8_data(); + const char *start = utf8_data(); int len = utf8_length(); - const char* end = start + len; + const char *end = start + len; CHECK(p >= start); CHECK(p <= end); CHECK(p == end || !UniLib::IsTrailByte(*p)); @@ -496,12 +483,12 @@ string UnicodeText::const_iterator::DebugString() const { return tensorflow::strings::Printf("{iter %p}", it_); } - // *************************** Utilities ************************* -string CodepointString(const UnicodeText& t) { +string CodepointString(const UnicodeText &t) { string s; UnicodeText::const_iterator it = t.begin(), end = t.end(); - while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++); + while (it != end) + tensorflow::strings::Appendf(&s, "%X ", *it++); return s; } diff --git a/unittest/util/utf8/unicodetext.h 
b/unittest/util/utf8/unicodetext.h index 4e25d3ee..e87c7a91 100644 --- a/unittest/util/utf8/unicodetext.h +++ b/unittest/util/utf8/unicodetext.h @@ -17,10 +17,10 @@ #ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_ #define UTIL_UTF8_PUBLIC_UNICODETEXT_H_ -#include // for NULL, ptrdiff_t -#include // for bidirectional_iterator_tag, etc -#include // for string -#include // for pair +#include // for NULL, ptrdiff_t +#include // for bidirectional_iterator_tag, etc +#include // for string +#include // for pair #include "syntaxnet/base.h" @@ -114,35 +114,38 @@ // efficient matching or display, and others. class UnicodeText { - public: +public: class const_iterator; typedef char32 value_type; // Constructors. These always produce owners. - UnicodeText(); // Create an empty text. - UnicodeText(const UnicodeText& src); // copy constructor + UnicodeText(); // Create an empty text. + UnicodeText(const UnicodeText &src); // copy constructor // Construct a substring (copies the data). - UnicodeText(const const_iterator& first, const const_iterator& last); + UnicodeText(const const_iterator &first, const const_iterator &last); // Assignment operator. This copies the data and produces an owner // unless this == &src, e.g., "x = x;", which is a no-op. - UnicodeText& operator=(const UnicodeText& src); + UnicodeText &operator=(const UnicodeText &src); // x.Copy(y) copies the data from y into x. - UnicodeText& Copy(const UnicodeText& src); - inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } + UnicodeText &Copy(const UnicodeText &src); + inline UnicodeText &assign(const UnicodeText &src) { + return Copy(src); + } // x.PointTo(y) changes x so that it points to y's data. // It does not copy y or take ownership of y's data. 
- UnicodeText& PointTo(const UnicodeText& src); - UnicodeText& PointTo(const const_iterator& first, - const const_iterator& last); + UnicodeText &PointTo(const UnicodeText &src); + UnicodeText &PointTo(const const_iterator &first, const const_iterator &last); ~UnicodeText(); - void clear(); // Clear text. - bool empty() const { return repr_.size_ == 0; } // Test if text is empty. + void clear(); // Clear text. + bool empty() const { + return repr_.size_ == 0; + } // Test if text is empty. // Add a codepoint to the end of the text. // If the codepoint is not interchange-valid, add a space instead @@ -156,105 +159,115 @@ class UnicodeText { // vector more_chars = ...; // utext.append(chars, chars+arraysize(chars)); // utext.append(more_chars.begin(), more_chars.end()); - template - UnicodeText& append(ForwardIterator first, const ForwardIterator last) { - while (first != last) { push_back(*first++); } + template + UnicodeText &append(ForwardIterator first, const ForwardIterator last) { + while (first != last) { + push_back(*first++); + } return *this; } // A specialization of the generic append() method. - UnicodeText& append(const const_iterator& first, const const_iterator& last); + UnicodeText &append(const const_iterator &first, const const_iterator &last); // An optimization of append(source.begin(), source.end()). 
- UnicodeText& append(const UnicodeText& source); + UnicodeText &append(const UnicodeText &source); - int size() const; // the number of Unicode characters (codepoints) + int size() const; // the number of Unicode characters (codepoints) - friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); - friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); + friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs); + friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs); class const_iterator { typedef const_iterator CI; - public: + + public: typedef std::bidirectional_iterator_tag iterator_category; typedef char32 value_type; typedef ptrdiff_t difference_type; - typedef void pointer; // (Not needed.) - typedef const char32 reference; // (Needed for const_reverse_iterator) + typedef void pointer; // (Not needed.) + typedef const char32 reference; // (Needed for const_reverse_iterator) // Iterators are default-constructible. const_iterator(); // It's safe to make multiple passes over a UnicodeText. - const_iterator(const const_iterator& other); - const_iterator& operator=(const const_iterator& other); + const_iterator(const const_iterator &other); + const_iterator &operator=(const const_iterator &other); - char32 operator*() const; // Dereference + char32 operator*() const; // Dereference - const_iterator& operator++(); // Advance (++iter) - const_iterator operator++(int) { // (iter++) + const_iterator &operator++(); // Advance (++iter) + const_iterator operator++(int) { // (iter++) const_iterator result(*this); ++*this; return result; } - const_iterator& operator--(); // Retreat (--iter) - const_iterator operator--(int) { // (iter--) + const_iterator &operator--(); // Retreat (--iter) + const_iterator operator--(int) { // (iter--) const_iterator result(*this); --*this; return result; } // We love relational operators. 
- friend bool operator==(const CI& lhs, const CI& rhs) { - return lhs.it_ == rhs.it_; } - friend bool operator!=(const CI& lhs, const CI& rhs) { - return !(lhs == rhs); } - friend bool operator<(const CI& lhs, const CI& rhs); - friend bool operator>(const CI& lhs, const CI& rhs) { - return rhs < lhs; } - friend bool operator<=(const CI& lhs, const CI& rhs) { - return !(rhs < lhs); } - friend bool operator>=(const CI& lhs, const CI& rhs) { - return !(lhs < rhs); } + friend bool operator==(const CI &lhs, const CI &rhs) { + return lhs.it_ == rhs.it_; + } + friend bool operator!=(const CI &lhs, const CI &rhs) { + return !(lhs == rhs); + } + friend bool operator<(const CI &lhs, const CI &rhs); + friend bool operator>(const CI &lhs, const CI &rhs) { + return rhs < lhs; + } + friend bool operator<=(const CI &lhs, const CI &rhs) { + return !(rhs < lhs); + } + friend bool operator>=(const CI &lhs, const CI &rhs) { + return !(lhs < rhs); + } - friend difference_type distance(const CI& first, const CI& last); + friend difference_type distance(const CI &first, const CI &last); // UTF-8-specific methods // Store the UTF-8 encoding of the current codepoint into buf, // which must be at least 4 bytes long. Return the number of // bytes written. - int get_utf8(char* buf) const; + int get_utf8(char *buf) const; // Return the UTF-8 character that the iterator points to. string get_utf8_string() const; // Return the byte length of the UTF-8 character the iterator points to. int utf8_length() const; // Return the iterator's pointer into the UTF-8 data. 
- const char* utf8_data() const { return it_; } + const char *utf8_data() const { + return it_; + } string DebugString() const; - private: + private: friend class UnicodeText; friend class UnicodeTextUtils; friend class UTF8StateTableProperty; - explicit const_iterator(const char* it) : it_(it) {} + explicit const_iterator(const char *it) : it_(it) {} - const char* it_; + const char *it_; }; const_iterator begin() const; const_iterator end() const; class const_reverse_iterator : public std::reverse_iterator { - public: - explicit const_reverse_iterator(const_iterator it) : - std::reverse_iterator(it) {} - const char* utf8_data() const { + public: + explicit const_reverse_iterator(const_iterator it) + : std::reverse_iterator(it) {} + const char *utf8_data() const { const_iterator tmp_it = base(); return (--tmp_it).utf8_data(); } - int get_utf8(char* buf) const { + int get_utf8(char *buf) const { const_iterator tmp_it = base(); return (--tmp_it).get_utf8(buf); } @@ -276,9 +289,9 @@ class UnicodeText { // Substring searching. Returns the beginning of the first // occurrence of "look", or end() if not found. - const_iterator find(const UnicodeText& look, const_iterator start_pos) const; + const_iterator find(const UnicodeText &look, const_iterator start_pos) const; // Equivalent to find(look, begin()) - const_iterator find(const UnicodeText& look) const; + const_iterator find(const UnicodeText &look) const; // Returns whether this contains the character U+FFFD. This can // occur, for example, if the input to Encodings::Decode() had byte @@ -289,13 +302,18 @@ class UnicodeText { // // Return the data, length, and capacity of UTF-8-encoded version of // the text. Length and capacity are measured in bytes. 
- const char* utf8_data() const { return repr_.data_; } - int utf8_length() const { return repr_.size_; } - int utf8_capacity() const { return repr_.capacity_; } + const char *utf8_data() const { + return repr_.data_; + } + int utf8_length() const { + return repr_.size_; + } + int utf8_capacity() const { + return repr_.capacity_; + } // Return the UTF-8 data as a string. - static string UTF8Substring(const const_iterator& first, - const const_iterator& last); + static string UTF8Substring(const const_iterator &first, const const_iterator &last); // There are three methods for initializing a UnicodeText from UTF-8 // data. They vary in details of memory management. In all cases, @@ -305,57 +323,58 @@ class UnicodeText { // is replaced with a space. // x.CopyUTF8(buf, len) copies buf into x. - UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); + UnicodeText &CopyUTF8(const char *utf8_buffer, int byte_length); // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of // buf. buf is not copied. - UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, - int byte_length, - int byte_capacity); + UnicodeText &TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity); // x.PointToUTF8(buf,len) changes x so that it points to buf // ("becomes an alias"). It does not take ownership or copy buf. // If the buffer is not valid, this has the same effect as // CopyUTF8(utf8_buffer, byte_length). - UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); + UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length); // Occasionally it is necessary to use functions that operate on the // pointer returned by utf8_data(). MakeIterator(p) provides a way // to get back to the UnicodeText level. It uses CHECK to ensure // that p is a pointer within this object's UTF-8 data, and that it // points to the beginning of a character. 
- const_iterator MakeIterator(const char* p) const; + const_iterator MakeIterator(const char *p) const; string DebugString() const; - private: +private: friend class const_iterator; friend class UnicodeTextUtils; - class Repr { // A byte-string. - public: - char* data_; + class Repr { // A byte-string. + public: + char *data_; int size_; int capacity_; - bool ours_; // Do we own data_? + bool ours_; // Do we own data_? Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} - ~Repr() { if (ours_) delete[] data_; } + ~Repr() { + if (ours_) + delete[] data_; + } void clear(); void reserve(int capacity); void resize(int size); - void append(const char* bytes, int byte_length); - void Copy(const char* data, int size); - void TakeOwnershipOf(char* data, int size, int capacity); - void PointTo(const char* data, int size); + void append(const char *bytes, int byte_length); + void Copy(const char *data, int size); + void TakeOwnershipOf(char *data, int size, int capacity); + void PointTo(const char *data, int size); string DebugString() const; - private: - Repr& operator=(const Repr&); - Repr(const Repr& other); + private: + Repr &operator=(const Repr &); + Repr(const Repr &other); }; Repr repr_; @@ -366,31 +385,27 @@ class UnicodeText { // It is an error to call these methods with UTF-8 data that // is not interchange-valid. 
// - UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); - UnicodeText& UnsafeTakeOwnershipOfUTF8( - char* utf8_buffer, int byte_length, int byte_capacity); - UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); - UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); - const_iterator UnsafeFind(const UnicodeText& look, - const_iterator start_pos) const; + UnicodeText &UnsafeCopyUTF8(const char *utf8_buffer, int byte_length); + UnicodeText &UnsafeTakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity); + UnicodeText &UnsafePointToUTF8(const char *utf8_buffer, int byte_length); + UnicodeText &UnsafeAppendUTF8(const char *utf8_buffer, int byte_length); + const_iterator UnsafeFind(const UnicodeText &look, const_iterator start_pos) const; }; -bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); +bool operator==(const UnicodeText &lhs, const UnicodeText &rhs); -inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { +inline bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs) { return !(lhs == rhs); } // UnicodeTextRange is a pair of iterators, useful for specifying text // segments. If the iterators are ==, the segment is empty. -typedef pair UnicodeTextRange; +typedef pair UnicodeTextRange; -inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { +inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange &r) { return r.first == r.second; } - // *************************** Utilities ************************* // A factory function for creating a UnicodeText from a buffer of @@ -402,18 +417,17 @@ inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { // replaced with a space, even if the codepoint was represented with a // multibyte sequence in the UTF-8 data. 
// -inline UnicodeText MakeUnicodeTextAcceptingOwnership( - char* utf8_buffer, int byte_length, int byte_capacity) { - return UnicodeText().TakeOwnershipOfUTF8( - utf8_buffer, byte_length, byte_capacity); +inline UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length, + int byte_capacity) { + return UnicodeText().TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity); } // A factory function for creating a UnicodeText from a buffer of // UTF-8 data. The new UnicodeText does not take ownership of the // buffer. (It is an "alias.") // -inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( - const char* utf8_buffer, int byte_length) { +inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(const char *utf8_buffer, + int byte_length) { return UnicodeText().PointToUTF8(utf8_buffer, byte_length); } @@ -434,8 +448,7 @@ inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( // made (as if do_copy were true) and coerced to valid UTF-8 by // replacing each invalid byte with a space. 
// -inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, - bool do_copy) { +inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy) { UnicodeText t; if (do_copy) { t.CopyUTF8(utf8_buf, len); @@ -445,20 +458,20 @@ inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, return t; } -inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { +inline UnicodeText UTF8ToUnicodeText(const string &utf_string, bool do_copy) { return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); } -inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { +inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len) { return UTF8ToUnicodeText(utf8_buf, len, true); } -inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { +inline UnicodeText UTF8ToUnicodeText(const string &utf8_string) { return UTF8ToUnicodeText(utf8_string, true); } // Return a string containing the UTF-8 encoded version of all the // Unicode characters in t. -inline string UnicodeTextToUTF8(const UnicodeText& t) { +inline string UnicodeTextToUTF8(const UnicodeText &t) { return string(t.utf8_data(), t.utf8_length()); } @@ -472,6 +485,6 @@ char (&ArraySizeHelper(T (&array)[N]))[N]; // For debugging. Return a string of integers, written in uppercase // hex (%X), corresponding to the codepoints within the text. Each // integer is followed by a space. E.g., "61 62 6A 3005 ". 
-string CodepointString(const UnicodeText& t); +string CodepointString(const UnicodeText &t); -#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_ +#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_ diff --git a/unittest/util/utf8/unilib.cc b/unittest/util/utf8/unilib.cc index c00759ae..ddc3f0df 100644 --- a/unittest/util/utf8/unilib.cc +++ b/unittest/util/utf8/unilib.cc @@ -32,27 +32,25 @@ namespace UniLib { // Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx bool IsInterchangeValid(char32 c) { return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || - (c >= 0x7F && c <= 0x9F) || - (c >= 0xD800 && c <= 0xDFFF) || - (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE); + (c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) || + (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE); } -int SpanInterchangeValid(const char* begin, int byte_length) { +int SpanInterchangeValid(const char *begin, int byte_length) { char32 rune; - const char* p = begin; - const char* end = begin + byte_length; + const char *p = begin; + const char *end = begin + byte_length; while (p < end) { int bytes_consumed = charntorune(&rune, p, end - p); // We want to accept Runeerror == U+FFFD as a valid char, but it is used // by chartorune to indicate error. Luckily, the real codepoint is size 3 // while errors return bytes_consumed <= 1. 
- if ((rune == Runeerror && bytes_consumed <= 1) || - !IsInterchangeValid(rune)) { - break; // Found + if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) { + break; // Found } p += bytes_consumed; } return p - begin; } -} // namespace UniLib +} // namespace UniLib diff --git a/unittest/util/utf8/unilib.h b/unittest/util/utf8/unilib.h index e99895a2..1f362eb0 100644 --- a/unittest/util/utf8/unilib.h +++ b/unittest/util/utf8/unilib.h @@ -42,8 +42,8 @@ namespace UniLib { // Returns the length in bytes of the prefix of src that is all // interchange valid UTF-8 -int SpanInterchangeValid(const char* src, int byte_length); -inline int SpanInterchangeValid(const std::string& src) { +int SpanInterchangeValid(const char *src, int byte_length); +inline int SpanInterchangeValid(const std::string &src) { return SpanInterchangeValid(src.data(), src.size()); } @@ -51,13 +51,13 @@ inline int SpanInterchangeValid(const std::string& src) { // "Interchange valid" is a stronger than structurally valid -- // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. bool IsInterchangeValid(char32 codepoint); -inline bool IsInterchangeValid(const char* src, int byte_length) { +inline bool IsInterchangeValid(const char *src, int byte_length) { return (byte_length == SpanInterchangeValid(src, byte_length)); } -inline bool IsInterchangeValid(const std::string& src) { +inline bool IsInterchangeValid(const std::string &src) { return IsInterchangeValid(src.data(), src.size()); } -} // namespace UniLib +} // namespace UniLib -#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ +#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ diff --git a/unittest/util/utf8/unilib_utf8_utils.h b/unittest/util/utf8/unilib_utf8_utils.h index a9c10166..f2d1520c 100644 --- a/unittest/util/utf8/unilib_utf8_utils.h +++ b/unittest/util/utf8/unilib_utf8_utils.h @@ -29,8 +29,7 @@ namespace UniLib { // (i.e., is not a surrogate codepoint). 
See also // IsValidCodepoint(const char* src) in util/utf8/public/unilib.h. inline bool IsValidCodepoint(char32 c) { - return (static_cast(c) < 0xD800) - || (c >= 0xE000 && c <= 0x10FFFF); + return (static_cast(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF); } // Returns true if 'str' is the start of a structurally valid UTF-8 @@ -41,16 +40,15 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) { char32 c; int consumed; // It's OK if str.length() > consumed. - return !str.empty() - && isvalidcharntorune(str.data(), str.size(), &c, &consumed) - && IsValidCodepoint(c); + return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) && + IsValidCodepoint(c); } // Returns the length (number of bytes) of the Unicode code point // starting at src, based on inspecting just that one byte. This // requires that src point to a well-formed UTF-8 string; the result // is undefined otherwise. -inline int OneCharLen(const char* src) { +inline int OneCharLen(const char *src) { return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; } @@ -61,6 +59,6 @@ inline bool IsTrailByte(char x) { return static_cast(x) < -0x40; } -} // namespace UniLib +} // namespace UniLib -#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ +#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ diff --git a/unittest/validate_grapheme_test.cc b/unittest/validate_grapheme_test.cc index 54e2f490..603f84ff 100644 --- a/unittest/validate_grapheme_test.cc +++ b/unittest/validate_grapheme_test.cc @@ -16,11 +16,10 @@ namespace tesseract { TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) { - std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E. + std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E. 
std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); // It made 3 graphemes. EXPECT_EQ(glyphs.size(), 3); @@ -30,113 +29,106 @@ TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) { } TEST(ValidateGraphemeTest, SingleConsonantOK) { - std::string str = "\u0cb9"; // HA + std::string str = "\u0cb9"; // HA std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); EXPECT_EQ(glyphs[0], str); } TEST(ValidateGraphemeTest, SimpleCV) { - std::string str = "\u0cb9\u0cbf"; // HA I + std::string str = "\u0cb9\u0cbf"; // HA I std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); EXPECT_EQ(glyphs[0], str); } TEST(ValidateGraphemeTest, SubscriptConjunct) { - std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I + std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, 
str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); EXPECT_EQ(glyphs[0], str); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95")); } TEST(ValidateGraphemeTest, HalfFormJoiner) { - std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta + std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); EXPECT_EQ(glyphs[0], str); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d")); } TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) { - std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta + std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, 
+ GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); EXPECT_EQ(glyphs[0], str); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d")); } TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) { - std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta + std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); EXPECT_EQ(glyphs[0], str); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d")); // Malaylam only, so not allowed in Telugu. 
- str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta - EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); } TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) { - std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta + std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 2); EXPECT_EQ(glyphs[1], std::string("\u0d24")); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c")); @@ -146,15 +138,14 @@ TEST(ValidateGraphemeTest, ThaiGraphemes) { // This is a single grapheme unless in glyph split mode std::string str = "\u0e14\u0e38\u0e4a"; std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 1); 
EXPECT_EQ(glyphs[0], str); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[0], std::string("\u0e14")); @@ -164,9 +155,8 @@ TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) { std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d"; std::vector glyphs; // Returns true, but the joiner is gone. - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)) + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)) << PrintString32WithUnicodes(str); EXPECT_EQ(glyphs.size(), 5); EXPECT_EQ(glyphs[0], std::string("'")); @@ -176,4 +166,4 @@ TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) { EXPECT_EQ(glyphs[4], std::string("'")); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/validate_indic_test.cc b/unittest/validate_indic_test.cc index d317198b..8fa9dab3 100644 --- a/unittest/validate_indic_test.cc +++ b/unittest/validate_indic_test.cc @@ -26,21 +26,19 @@ namespace tesseract { // normalizer always puts a termninating ZWNJ on the end if not present, // and accepts the string as valid. TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) { - std::string str = "\u0c15\u0c4d"; // KA - virama - std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ + std::string str = "\u0c15\u0c4d"; // KA - virama + std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); // Same result if we started with the normalized string. 
- ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, - target_str); + ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); } // Only one dependent vowel is allowed. TEST(ValidateIndicTest, OnlyOneDependentVowel) { - std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU + std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU std::string dest; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &dest)) + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); } @@ -53,24 +51,21 @@ TEST(ValidateIndicTest, OnlyOneDependentVowel) { // References: // http://www.omniglot.com/writing/telugu.htm TEST(ValidateIndicTest, OnlyOneVowelModifier) { - std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu + std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu std::string result; - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); // It made 1 grapheme of 4 chars, by terminating the explicit virama. EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result); - str = "\u0995\u0983\u0981"; // KA visarga candrabindu - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + str = "\u0995\u0983\u0981"; // KA visarga candrabindu + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); // Exception: Malayalam allows multiple anusvara. 
- str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); EXPECT_EQ(str, result); } @@ -83,16 +78,15 @@ TEST(ValidateIndicTest, OnlyOneVowelModifier) { // and the Microsoft page // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx TEST(ValidateIndicTest, VowelModifierMustBeLast) { - std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I + std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I std::string dest; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &dest)) + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); // Swap c02/c3f and all is ok. - str = "\u0c28\u0c3f\u0c02"; // NA I Sunna - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), &dest)) + str = "\u0c28\u0c3f\u0c02"; // NA I Sunna + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); } @@ -106,100 +100,98 @@ TEST(ValidateIndicTest, VowelModifierMustBeLast) { // Principles of the Devanagari Script: Dependent Vowel Signs (Matras). 
// + http://varamozhi.sourceforge.net/iscii91.pdf TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) { - std::string str = "\u0c05\u0c47"; // A EE + std::string str = "\u0c05\u0c47"; // A EE std::string dest; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &dest)) + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); - str = "\u0c1e\u0c3e"; // NYA AA - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), &dest)) + str = "\u0c1e\u0c3e"; // NYA AA + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); } // Sub-graphemes are allowed if GraphemeNorm is turned off. TEST(ValidateIndicTest, SubGraphemes) { - std::string str = "\u0d3e"; // AA + std::string str = "\u0d3e"; // AA std::string dest; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &dest)) + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNone, str.c_str(), &dest)) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); } TEST(ValidateIndicTest, Nukta) { - std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA + std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); + 
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9")); // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it. - std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA + std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str); } // Sinhala has some of its own specific rules. See www.macciato.com/sinhala TEST(ValidateIndicTest, SinhalaRakaransaya) { - std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna + std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna std::string dest; - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), &dest)) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 2); EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb")); // Can be followed by a dependent vowel. 
- str += "\u0dd9"; // E - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), &dest)) + str += "\u0dd9"; // E + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); } TEST(ValidateIndicTest, SinhalaYansaya) { - std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna + std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna std::string dest; - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), &dest)) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); // Can be followed by a dependent vowel. - str += "\u0ddd"; // OO - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), &dest)) + str += "\u0ddd"; // OO + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba")); } TEST(ValidateIndicTest, SinhalaRepaya) { - std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA + std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - 
str.c_str(), &glyphs)); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 2); EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8")); - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 3); EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); } @@ -208,9 +200,9 @@ TEST(ValidateIndicTest, SinhalaSpecials) { // Sinhala has some exceptions from the usual rules. std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d"; std::vector glyphs; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(glyphs[0], std::string("\u0dc0")); EXPECT_EQ(glyphs[1], std::string("\u0d9c")); @@ -218,9 +210,9 @@ TEST(ValidateIndicTest, SinhalaSpecials) { EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d")); EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d")); str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf"; - EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(glyphs[0], std::string("\u0dc3")); EXPECT_EQ(glyphs[1], 
std::string("\u0dbb\u0dca\u200d")); @@ -228,4 +220,4 @@ TEST(ValidateIndicTest, SinhalaSpecials) { EXPECT_EQ(glyphs[3], std::string("\u0dcf")); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/validate_khmer_test.cc b/unittest/validate_khmer_test.cc index 74b87e61..bb47b42f 100644 --- a/unittest/validate_khmer_test.cc +++ b/unittest/validate_khmer_test.cc @@ -32,19 +32,16 @@ TEST(ValidateKhmerTest, BadKhmerWords) { std::string result; // Multiple dependent vowels not allowed std::string str = "\u1796\u17b6\u17b7"; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); // Multiple shifters not allowed str = "\u1798\u17c9\u17ca"; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); // Multiple signs not allowed str = "\u1780\u17b6\u17cb\u17cd"; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/validate_myanmar_test.cc b/unittest/validate_myanmar_test.cc index 262e04b6..8f7a39ae 100644 --- a/unittest/validate_myanmar_test.cc +++ b/unittest/validate_myanmar_test.cc @@ -17,7 +17,7 @@ namespace tesseract { // Test some random Myanmar words. TEST(ValidateMyanmarTest, GoodMyanmarWords) { - std::string str = "လျှာကသိသည် "; // No viramas in this one. + std::string str = "လျှာကသိသည် "; // No viramas in this one. 
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 11, 5, str); str = "တုန္လႈပ္မႈ "; ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 9, 4, str); @@ -27,28 +27,26 @@ TEST(ValidateMyanmarTest, GoodMyanmarWords) { TEST(ValidateMyanmarTest, BadMyanmarWords) { std::string str = "က်န္းမာေရး"; std::vector glyphs; - EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, - str.c_str(), &glyphs)); + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, str.c_str(), + &glyphs)); std::string result; - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); // It works if the grapheme normalization is turned off. - EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNone, str.c_str(), &result)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone, + str.c_str(), &result)); EXPECT_EQ(str, result); str = "ခုႏွစ္"; - EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( - UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, - true, str.c_str(), &glyphs)); - EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNormalize, str.c_str(), - &result)); + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, str.c_str(), + &glyphs)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, + str.c_str(), &result)); // It works if the grapheme normalization is turned off. 
- EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, - GraphemeNorm::kNone, str.c_str(), &result)); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone, + str.c_str(), &result)); EXPECT_EQ(str, result); } -} // namespace tesseract +} // namespace tesseract diff --git a/unittest/validator_test.cc b/unittest/validator_test.cc index 84cb42af..14f02863 100644 --- a/unittest/validator_test.cc +++ b/unittest/validator_test.cc @@ -11,15 +11,14 @@ #include "validator.h" -#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "gmock/gmock.h" // for testing::ElementsAreArray #include "include_gunit.h" namespace tesseract { class TestableValidator : public Validator { - public: - static ViramaScript TestableMostFrequentViramaScript( - const std::vector& utf32) { +public: + static ViramaScript TestableMostFrequentViramaScript(const std::vector &utf32) { return MostFrequentViramaScript(utf32); } }; @@ -29,48 +28,40 @@ class TestableValidator : public Validator { TEST(ValidatorTest, MostFrequentViramaScript) { // The most frequent virama script should come out correct, despite // distractions from other scripts. - EXPECT_EQ(ViramaScript::kTelugu, - TestableValidator::TestableMostFrequentViramaScript({0xc05})); + EXPECT_EQ(ViramaScript::kTelugu, TestableValidator::TestableMostFrequentViramaScript({0xc05})); // It is still Telugu surrounded by Latin. EXPECT_EQ(ViramaScript::kTelugu, - TestableValidator::TestableMostFrequentViramaScript( - {'a', 0xc05, 'b', 'c'})); + TestableValidator::TestableMostFrequentViramaScript({'a', 0xc05, 'b', 'c'})); // But not still Telugu surrounded by Devanagari. 
EXPECT_EQ(ViramaScript::kDevanagari, - TestableValidator::TestableMostFrequentViramaScript( - {0x905, 0xc05, 0x906, 0x907})); + TestableValidator::TestableMostFrequentViramaScript({0x905, 0xc05, 0x906, 0x907})); EXPECT_EQ(ViramaScript::kKannada, - TestableValidator::TestableMostFrequentViramaScript( - {0xc85, 0xc05, 0xc86, 0xc87})); + TestableValidator::TestableMostFrequentViramaScript({0xc85, 0xc05, 0xc86, 0xc87})); EXPECT_EQ(ViramaScript::kBengali, - TestableValidator::TestableMostFrequentViramaScript( - {0x985, 0xc05, 0x986, 0x987})); + TestableValidator::TestableMostFrequentViramaScript({0x985, 0xc05, 0x986, 0x987})); // Danda and double Danda don't count as Devanagari, as they are common. EXPECT_EQ(ViramaScript::kTelugu, - TestableValidator::TestableMostFrequentViramaScript( - {0x964, 0xc05, 0x965, 0x965})); + TestableValidator::TestableMostFrequentViramaScript({0x964, 0xc05, 0x965, 0x965})); } // ValidateCleanAndSegment doesn't modify the input by much, but its // transformation should be idempotent. (Doesn't change again if re-applied.) 
TEST(ValidatorTest, Idempotency) { - std::vector str1( - {0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c}); - std::vector str2( - {0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''}); + std::vector str1({0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c}); + std::vector str2({0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''}); std::vector> result1, result2, result3, result4; - EXPECT_TRUE(Validator::ValidateCleanAndSegment( - GraphemeNormMode::kSingleString, true, str1, &result1)); - EXPECT_TRUE(Validator::ValidateCleanAndSegment( - GraphemeNormMode::kSingleString, true, result1[0], &result2)); + EXPECT_TRUE( + Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str1, &result1)); + EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result1[0], + &result2)); EXPECT_EQ(result1.size(), result2.size()); EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0])); - EXPECT_TRUE(Validator::ValidateCleanAndSegment( - GraphemeNormMode::kSingleString, true, str2, &result3)); - EXPECT_TRUE(Validator::ValidateCleanAndSegment( - GraphemeNormMode::kSingleString, true, result3[0], &result4)); + EXPECT_TRUE( + Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str2, &result3)); + EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result3[0], + &result4)); EXPECT_EQ(result3.size(), result4.size()); EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0])); } -} // namespace tesseract +} // namespace tesseract