mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
[clang-format] Format unit tests.
This commit is contained in:
parent
618b185d14
commit
1d5b083447
@ -22,21 +22,23 @@
|
||||
// expects clone of tessdata_fast repo in ../../tessdata_fast
|
||||
|
||||
//#include "log.h"
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <time.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <locale>
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <string>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include "include_gunit.h"
|
||||
#include <allheaders.h>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class QuickTest : public testing::Test {
|
||||
protected:
|
||||
virtual void SetUp() { start_time_ = time(nullptr); }
|
||||
virtual void SetUp() {
|
||||
start_time_ = time(nullptr);
|
||||
}
|
||||
virtual void TearDown() {
|
||||
#ifndef NDEBUG
|
||||
// Debug builds can be very slow, so allow 4 min for OCR of a test image.
|
||||
@ -49,71 +51,62 @@ class QuickTest : public testing::Test {
|
||||
#endif
|
||||
const time_t end_time = time(nullptr);
|
||||
EXPECT_TRUE(end_time - start_time_ <= MAX_SECONDS_FOR_TEST)
|
||||
<< "The test took too long - "
|
||||
<< ::testing::PrintToString(end_time - start_time_);
|
||||
<< "The test took too long - " << ::testing::PrintToString(end_time - start_time_);
|
||||
}
|
||||
time_t start_time_;
|
||||
};
|
||||
|
||||
void OCRTester(const char* imgname, const char* groundtruth,
|
||||
const char* tessdatadir, const char* lang) {
|
||||
void OCRTester(const char *imgname, const char *groundtruth, const char *tessdatadir,
|
||||
const char *lang) {
|
||||
// log.info() << tessdatadir << " for language: " << lang << std::endl;
|
||||
char *outText;
|
||||
std::locale loc("C"); // You can also use "" for the default system locale
|
||||
std::ifstream file(groundtruth);
|
||||
file.imbue(loc); // Use it for file input
|
||||
std::string gtText((std::istreambuf_iterator<char>(file)),
|
||||
std::istreambuf_iterator<char>());
|
||||
std::string gtText((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang))
|
||||
<< "Could not initialize tesseract.";
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract.";
|
||||
Pix *image = pixRead(imgname);
|
||||
ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
|
||||
api->SetImage(image);
|
||||
outText = api->GetUTF8Text();
|
||||
EXPECT_EQ(gtText, outText)
|
||||
<< "Phototest.tif OCR does not match ground truth for "
|
||||
EXPECT_EQ(gtText, outText) << "Phototest.tif OCR does not match ground truth for "
|
||||
<< ::testing::PrintToString(lang);
|
||||
api->End();
|
||||
delete[] outText;
|
||||
pixDestroy(&image);
|
||||
}
|
||||
|
||||
class MatchGroundTruth : public QuickTest,
|
||||
public ::testing::WithParamInterface<const char*> {};
|
||||
class MatchGroundTruth : public QuickTest, public ::testing::WithParamInterface<const char *> {};
|
||||
|
||||
TEST_P(MatchGroundTruth, FastPhototestOCR) {
|
||||
OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt",
|
||||
TESSDATA_DIR "_fast", GetParam());
|
||||
OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", TESSDATA_DIR "_fast",
|
||||
GetParam());
|
||||
}
|
||||
|
||||
TEST_P(MatchGroundTruth, BestPhototestOCR) {
|
||||
OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt",
|
||||
TESSDATA_DIR "_best", GetParam());
|
||||
OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", TESSDATA_DIR "_best",
|
||||
GetParam());
|
||||
}
|
||||
|
||||
TEST_P(MatchGroundTruth, TessPhototestOCR) {
|
||||
OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt",
|
||||
TESSDATA_DIR, GetParam());
|
||||
OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", TESSDATA_DIR, GetParam());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(Eng, MatchGroundTruth, ::testing::Values("eng"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth,
|
||||
::testing::Values("script/Latin"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth,
|
||||
::testing::Values("script/Devanagari"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth,
|
||||
::testing::Values("script/Arabic"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth, ::testing::Values("script/Latin"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth, ::testing::Values("script/Devanagari"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth, ::testing::Values("script/Arabic"));
|
||||
|
||||
class EuroText : public QuickTest {};
|
||||
|
||||
TEST_F(EuroText, FastLatinOCR) {
|
||||
OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt",
|
||||
TESSDATA_DIR "_fast", "script/Latin");
|
||||
OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt", TESSDATA_DIR "_fast",
|
||||
"script/Latin");
|
||||
}
|
||||
|
||||
// script/Latin for eurotext.tif does not match the ground truth
|
||||
// for tessdata & tessdata_best,
|
||||
// so do not test these here.
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,12 +9,12 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string>
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/resultiterator.h>
|
||||
#include <string>
|
||||
#include "boxread.h"
|
||||
#include "rect.h"
|
||||
#include <tesseract/resultiterator.h>
|
||||
|
||||
#include "include_gunit.h"
|
||||
|
||||
@ -29,10 +29,16 @@ class ApplyBoxTest : public testing::Test {
|
||||
std::string TestDataNameToPath(const std::string &name) {
|
||||
return file::JoinPath(TESTING_DIR, name);
|
||||
}
|
||||
std::string TessdataPath() { return TESSDATA_DIR; }
|
||||
std::string TessdataPath() {
|
||||
return TESSDATA_DIR;
|
||||
}
|
||||
|
||||
ApplyBoxTest() { src_pix_ = nullptr; }
|
||||
~ApplyBoxTest() { pixDestroy(&src_pix_); }
|
||||
ApplyBoxTest() {
|
||||
src_pix_ = nullptr;
|
||||
}
|
||||
~ApplyBoxTest() {
|
||||
pixDestroy(&src_pix_);
|
||||
}
|
||||
|
||||
bool SetImage(const char *filename) {
|
||||
bool found = false;
|
||||
@ -53,8 +59,8 @@ class ApplyBoxTest : public testing::Test {
|
||||
// the boxes match the given box file well enough.
|
||||
// If line_mode is true, ApplyBoxes is run in line segmentation mode,
|
||||
// otherwise the input box file is assumed to have character-level boxes.
|
||||
void VerifyBoxesAndText(const char* imagefile, const char* truth_str,
|
||||
const char* target_box_file, bool line_mode) {
|
||||
void VerifyBoxesAndText(const char *imagefile, const char *truth_str, const char *target_box_file,
|
||||
bool line_mode) {
|
||||
if (!SetImage(imagefile)) {
|
||||
// eng.traineddata not found or other problem during Init.
|
||||
GTEST_SKIP();
|
||||
@ -77,14 +83,12 @@ class ApplyBoxTest : public testing::Test {
|
||||
ResultIterator *it = api_.GetIterator();
|
||||
do {
|
||||
int left, top, right, bottom;
|
||||
EXPECT_TRUE(
|
||||
it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom));
|
||||
EXPECT_TRUE(it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom));
|
||||
TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top));
|
||||
int line_number = 0;
|
||||
TBOX truth_box;
|
||||
STRING box_text;
|
||||
EXPECT_TRUE(
|
||||
ReadNextBox(0, &line_number, box_file, &box_text, &truth_box));
|
||||
EXPECT_TRUE(ReadNextBox(0, &line_number, box_file, &box_text, &truth_box));
|
||||
// Testing for major overlap is a bit weak, but if every box passes the
|
||||
// major-overlap check, then the result has to be fairly close.
|
||||
EXPECT_TRUE(ocr_box.major_overlap(truth_box));
|
||||
@ -103,26 +107,22 @@ class ApplyBoxTest : public testing::Test {
|
||||
|
||||
// Tests character-level applyboxes on normal Times New Roman.
|
||||
TEST_F(ApplyBoxTest, TimesCharLevel) {
|
||||
VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box",
|
||||
false);
|
||||
VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box", false);
|
||||
}
|
||||
|
||||
// Tests character-level applyboxes on italic Times New Roman.
|
||||
TEST_F(ApplyBoxTest, ItalicCharLevel) {
|
||||
VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box",
|
||||
false);
|
||||
VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box", false);
|
||||
}
|
||||
|
||||
// Tests line-level applyboxes on normal Times New Roman.
|
||||
TEST_F(ApplyBoxTest, TimesLineLevel) {
|
||||
VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine,
|
||||
"trainingtimes.box", true);
|
||||
VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine, "trainingtimes.box", true);
|
||||
}
|
||||
|
||||
// Tests line-level applyboxes on italic Times New Roman.
|
||||
TEST_F(ApplyBoxTest, ItalLineLevel) {
|
||||
VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box",
|
||||
true);
|
||||
VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box", true);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -34,12 +34,11 @@ using ::testing::ContainsRegex;
|
||||
using ::testing::HasSubstr;
|
||||
|
||||
static const char *langs[] = {"eng", "vie", "hin", "ara", nullptr};
|
||||
static const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif",
|
||||
"arabic.tif", nullptr};
|
||||
static const char *image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif", "arabic.tif",
|
||||
nullptr};
|
||||
static const char *gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67",
|
||||
"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
|
||||
"\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a",
|
||||
nullptr};
|
||||
"\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr};
|
||||
|
||||
class FriendlyTessBaseAPI : public tesseract::TessBaseAPI {
|
||||
FRIEND_TEST(TesseractTest, LSTMGeometryTest);
|
||||
@ -74,8 +73,8 @@ TEST_F(TesseractTest, BasicTesseractTest) {
|
||||
Pix *src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
|
||||
CHECK(src_pix);
|
||||
ocr_text = GetCleanedTextResult(&api, src_pix);
|
||||
CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"),
|
||||
&truth_text, file::Defaults()));
|
||||
CHECK_OK(
|
||||
file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
|
||||
absl::StripAsciiWhitespace(&truth_text);
|
||||
EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
|
||||
pixDestroy(&src_pix);
|
||||
@ -149,7 +148,8 @@ TEST_F(TesseractTest, HOCRContainsBaseline) {
|
||||
char *result = api.GetHOCRText(0);
|
||||
EXPECT_TRUE(result != nullptr);
|
||||
EXPECT_THAT(result, HasSubstr("Hello"));
|
||||
EXPECT_TRUE(std::regex_search(result, std::regex{ "<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+" }));
|
||||
EXPECT_TRUE(std::regex_search(
|
||||
result, std::regex{"<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+"}));
|
||||
|
||||
delete[] result;
|
||||
pixDestroy(&src_pix);
|
||||
@ -161,13 +161,12 @@ TEST_F(TesseractTest, AdaptToWordStrTest) {
|
||||
// Skip test because TessBaseAPI::AdaptToWordStr is missing.
|
||||
GTEST_SKIP();
|
||||
#else
|
||||
static const char* kTrainingPages[] = {
|
||||
"136.tif", "256.tif", "410.tif", "432.tif", "540.tif",
|
||||
static const char *kTrainingPages[] = {"136.tif", "256.tif", "410.tif", "432.tif", "540.tif",
|
||||
"692.tif", "779.tif", "793.tif", "808.tif", "815.tif",
|
||||
"12.tif", "12.tif", nullptr};
|
||||
static const char* kTrainingText[] = {
|
||||
"1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0", "6 9 2", "7 7 9",
|
||||
"7 9 3", "8 0 8", "8 1 5", "1 2", "1 2", nullptr};
|
||||
static const char *kTrainingText[] = {"1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0",
|
||||
"6 9 2", "7 7 9", "7 9 3", "8 0 8", "8 1 5",
|
||||
"1 2", "1 2", nullptr};
|
||||
static const char *kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr};
|
||||
static const char *kTestText[] = {"324", "433", "12", nullptr};
|
||||
tesseract::TessBaseAPI api;
|
||||
@ -186,10 +185,8 @@ TEST_F(TesseractTest, AdaptToWordStrTest) {
|
||||
Pix *src_pix = pixRead(image_file.c_str());
|
||||
CHECK(src_pix);
|
||||
api.SetImage(src_pix);
|
||||
EXPECT_TRUE(
|
||||
api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i]))
|
||||
<< "Failed to adapt to text \"" << kTrainingText[i] << "\" on image "
|
||||
<< image_file;
|
||||
EXPECT_TRUE(api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i]))
|
||||
<< "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " << image_file;
|
||||
pixDestroy(&src_pix);
|
||||
}
|
||||
// Test the test text.
|
||||
@ -219,8 +216,8 @@ TEST_F(TesseractTest, BasicLSTMTest) {
|
||||
Pix *src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str());
|
||||
CHECK(src_pix);
|
||||
ocr_text = GetCleanedTextResult(&api, src_pix);
|
||||
CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"),
|
||||
&truth_text, file::Defaults()));
|
||||
CHECK_OK(
|
||||
file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
|
||||
absl::StripAsciiWhitespace(&truth_text);
|
||||
EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
|
||||
pixDestroy(&src_pix);
|
||||
@ -250,8 +247,7 @@ TEST_F(TesseractTest, LSTMGeometryTest) {
|
||||
CHECK(block);
|
||||
|
||||
// extract word and character boxes for each word
|
||||
for (page_res_it.restart_page(); page_res_it.word() != nullptr;
|
||||
page_res_it.forward()) {
|
||||
for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
|
||||
WERD_RES *word = page_res_it.word();
|
||||
CHECK(word);
|
||||
CHECK(word->best_choice);
|
||||
@ -285,11 +281,9 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
|
||||
for (size_t i = 0; i < countof(langs); ++i) {
|
||||
api.reset(new tesseract::TessBaseAPI);
|
||||
timer.Restart();
|
||||
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
|
||||
tesseract::OEM_TESSERACT_ONLY));
|
||||
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], tesseract::OEM_TESSERACT_ONLY));
|
||||
timer.Stop();
|
||||
LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs()
|
||||
<< "ms in regular init";
|
||||
LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() << "ms in regular init";
|
||||
}
|
||||
// Init variables to set for config-only initialization.
|
||||
std::vector<std::string> vars_vec, vars_values;
|
||||
@ -299,12 +293,10 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
|
||||
for (size_t i = 0; i < countof(langs); ++i) {
|
||||
api.reset(new tesseract::TessBaseAPI);
|
||||
timer.Restart();
|
||||
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
|
||||
tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec,
|
||||
&vars_values, false));
|
||||
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], tesseract::OEM_TESSERACT_ONLY, nullptr,
|
||||
0, &vars_vec, &vars_values, false));
|
||||
timer.Stop();
|
||||
LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs()
|
||||
<< "ms in config-only init";
|
||||
LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() << "ms in config-only init";
|
||||
}
|
||||
}
|
||||
|
||||
@ -316,7 +308,8 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
|
||||
// OEM_DEFAULT mode.
|
||||
TEST(TesseractInstanceTest, TestMultipleTessInstances) {
|
||||
int num_langs = 0;
|
||||
while (langs[num_langs] != nullptr) ++num_langs;
|
||||
while (langs[num_langs] != nullptr)
|
||||
++num_langs;
|
||||
|
||||
const std::string kTessdataPath = TESSDATA_DIR;
|
||||
|
||||
@ -351,7 +344,8 @@ TEST(TesseractInstanceTest, TestMultipleTessInstances) {
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_langs; ++i) pixDestroy(&pix[i]);
|
||||
for (int i = 0; i < num_langs; ++i)
|
||||
pixDestroy(&pix[i]);
|
||||
}
|
||||
|
||||
// Tests whether Tesseract parameters are correctly set for the two instances.
|
||||
@ -391,12 +385,11 @@ TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
|
||||
bool boolvar;
|
||||
EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar));
|
||||
EXPECT_EQ(bool_param[i], boolvar);
|
||||
EXPECT_STREQ(str_param[i].c_str(),
|
||||
api->GetStringVariable(str_param_name.c_str()));
|
||||
EXPECT_STREQ(str_param[i].c_str(), api->GetStringVariable(str_param_name.c_str()));
|
||||
double doublevar;
|
||||
EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar));
|
||||
EXPECT_EQ(double_param[i], doublevar);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -26,11 +26,11 @@
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
# include <tensorflow/core/lib/core/threadpool.h>
|
||||
#endif
|
||||
#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace
|
||||
#include <allheaders.h>
|
||||
#include "include_gunit.h"
|
||||
#include <tesseract/baseapi.h>
|
||||
#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace
|
||||
#include "commandlineflags.h"
|
||||
#include "include_gunit.h"
|
||||
#include "log.h"
|
||||
|
||||
// Run with Tesseract instances.
|
||||
@ -54,13 +54,11 @@ namespace tesseract {
|
||||
|
||||
static const char *kTessLangs[] = {"eng", "vie", nullptr};
|
||||
static const char *kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr};
|
||||
static const char* kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67",
|
||||
nullptr};
|
||||
static const char *kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", nullptr};
|
||||
|
||||
static const char *kCubeLangs[] = {"hin", "ara", nullptr};
|
||||
static const char *kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr};
|
||||
static const char* kCubeTruthText[] = {
|
||||
"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
|
||||
static const char *kCubeTruthText[] = {"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
|
||||
"\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr};
|
||||
|
||||
class BaseapiThreadTest : public ::testing::Test {
|
||||
@ -105,8 +103,7 @@ class BaseapiThreadTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
pool_size_ = (FLAGS_max_concurrent_instances < 1)
|
||||
? num_langs_ * FLAGS_reps
|
||||
pool_size_ = (FLAGS_max_concurrent_instances < 1) ? num_langs_ * FLAGS_reps
|
||||
: FLAGS_max_concurrent_instances;
|
||||
#endif
|
||||
}
|
||||
@ -119,10 +116,13 @@ class BaseapiThreadTest : public ::testing::Test {
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
void ResetPool() {
|
||||
pool_.reset(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_));
|
||||
pool_.reset(
|
||||
new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_));
|
||||
}
|
||||
|
||||
void WaitForPoolWorkers() { pool_.reset(nullptr); }
|
||||
void WaitForPoolWorkers() {
|
||||
pool_.reset(nullptr);
|
||||
}
|
||||
|
||||
std::unique_ptr<tensorflow::thread::ThreadPool> pool_;
|
||||
static int pool_size_;
|
||||
@ -167,7 +167,8 @@ static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lan
|
||||
std::string ocr_text;
|
||||
GetCleanedText(tess_local, pix, &ocr_text);
|
||||
EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str());
|
||||
if (tess_local != tess) delete tess_local;
|
||||
if (tess_local != tess)
|
||||
delete tess_local;
|
||||
}
|
||||
|
||||
// Check that Tesseract/Cube produce the correct results in single-threaded
|
||||
@ -178,8 +179,7 @@ TEST_F(BaseapiThreadTest, TestBasicSanity) {
|
||||
InitTessInstance(&tess, langs_[i]);
|
||||
std::string ocr_text;
|
||||
GetCleanedText(&tess, pix_[i], &ocr_text);
|
||||
CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0)
|
||||
<< "Failed with lang = " << langs_[i];
|
||||
CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) << "Failed with lang = " << langs_[i];
|
||||
}
|
||||
}
|
||||
|
||||
@ -208,8 +208,8 @@ TEST_F(BaseapiThreadTest, TestRecognition) {
|
||||
|
||||
ResetPool();
|
||||
for (int i = 0; i < n; ++i) {
|
||||
pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i],
|
||||
langs_[i % num_langs_], gt_text_[i % num_langs_]));
|
||||
pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], langs_[i % num_langs_],
|
||||
gt_text_[i % num_langs_]));
|
||||
}
|
||||
WaitForPoolWorkers();
|
||||
#endif
|
||||
@ -220,10 +220,10 @@ TEST_F(BaseapiThreadTest, TestAll) {
|
||||
const int n = num_langs_ * FLAGS_reps;
|
||||
ResetPool();
|
||||
for (int i = 0; i < n; ++i) {
|
||||
pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i],
|
||||
langs_[i % num_langs_], gt_text_[i % num_langs_]));
|
||||
pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], langs_[i % num_langs_],
|
||||
gt_text_[i % num_langs_]));
|
||||
}
|
||||
WaitForPoolWorkers();
|
||||
#endif
|
||||
}
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -38,11 +38,13 @@ class BitVectorTest : public testing::Test {
|
||||
TestAll(*map, false);
|
||||
map->SetBit(2);
|
||||
// Set all the odds to true.
|
||||
for (int i = 3; i <= kPrimeLimit; i += 2) map->SetValue(i, true);
|
||||
for (int i = 3; i <= kPrimeLimit; i += 2)
|
||||
map->SetValue(i, true);
|
||||
int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit));
|
||||
for (int f = 3; f <= factor_limit; f += 2) {
|
||||
if (map->At(f)) {
|
||||
for (int m = 2; m * f <= kPrimeLimit; ++m) map->ResetBit(f * m);
|
||||
for (int m = 2; m * f <= kPrimeLimit; ++m)
|
||||
map->ResetBit(f * m);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -53,7 +55,8 @@ class BitVectorTest : public testing::Test {
|
||||
// of which is 997.
|
||||
int total_primes = 0;
|
||||
for (int i = 0; i <= kPrimeLimit; ++i) {
|
||||
if (map[i]) ++total_primes;
|
||||
if (map[i])
|
||||
++total_primes;
|
||||
}
|
||||
EXPECT_EQ(168, total_primes);
|
||||
EXPECT_TRUE(map[997]);
|
||||
@ -163,4 +166,4 @@ TEST_F(BitVectorTest, TestNumSetBits) {
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -14,8 +14,7 @@
|
||||
#include <tesseract/capi.h>
|
||||
|
||||
// Verifies that the libtesseract library has C API symbols.
|
||||
int main()
|
||||
{
|
||||
int main() {
|
||||
printf("%s\n", TessVersion());
|
||||
return 0;
|
||||
}
|
||||
|
@ -16,4 +16,6 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
// Verifies that the libtesseract library has C API symbols.
|
||||
TEST(C, VersionTest) { TessVersion(); }
|
||||
TEST(C, VersionTest) {
|
||||
TessVersion();
|
||||
}
|
||||
|
@ -23,6 +23,8 @@ enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD };
|
||||
namespace tesseract {
|
||||
|
||||
// Verifies that the global namespace is clean.
|
||||
TEST(CleanNamespaceTess, DummyTest) { tesseract::TessBaseAPI api; }
|
||||
TEST(CleanNamespaceTess, DummyTest) {
|
||||
tesseract::TessBaseAPI api;
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -73,4 +73,4 @@ TEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) {
|
||||
EXPECT_TRUE(b.IsInSameColumnAs(a));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -42,8 +42,7 @@ class CommandlineflagsTest : public ::testing::Test {
|
||||
};
|
||||
|
||||
TEST_F(CommandlineflagsTest, RemoveFlags) {
|
||||
const char* const_argv[] = {"Progname", "--foo_int", "3", "file1.h",
|
||||
"file2.h"};
|
||||
const char *const_argv[] = {"Progname", "--foo_int", "3", "file1.h", "file2.h"};
|
||||
int argc = countof(const_argv);
|
||||
char **argv = const_cast<char **>(const_argv);
|
||||
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
|
||||
@ -77,16 +76,16 @@ TEST_F(CommandlineflagsTest, ParseIntegerFlags) {
|
||||
EXPECT_EQ(-4, FLAGS_bar_int);
|
||||
|
||||
const char *arg_no_value[] = {"", "--bar_int"};
|
||||
EXPECT_EXIT(TestParser(countof(arg_no_value), arg_no_value),
|
||||
::testing::ExitedWithCode(1), "ERROR");
|
||||
EXPECT_EXIT(TestParser(countof(arg_no_value), arg_no_value), ::testing::ExitedWithCode(1),
|
||||
"ERROR");
|
||||
|
||||
const char *arg_invalid_value[] = {"", "--bar_int", "--foo_int=3"};
|
||||
EXPECT_EXIT(TestParser(countof(arg_invalid_value), arg_invalid_value),
|
||||
::testing::ExitedWithCode(1), "ERROR");
|
||||
|
||||
const char *arg_bad_format[] = {"", "--bar_int="};
|
||||
EXPECT_EXIT(TestParser(countof(arg_bad_format), arg_bad_format),
|
||||
::testing::ExitedWithCode(1), "ERROR");
|
||||
EXPECT_EXIT(TestParser(countof(arg_bad_format), arg_bad_format), ::testing::ExitedWithCode(1),
|
||||
"ERROR");
|
||||
}
|
||||
|
||||
TEST_F(CommandlineflagsTest, ParseDoubleFlags) {
|
||||
@ -97,12 +96,10 @@ TEST_F(CommandlineflagsTest, ParseDoubleFlags) {
|
||||
EXPECT_EQ(1.2, FLAGS_bar_double);
|
||||
|
||||
const char *arg_no_value[] = {"", "--bar_double"};
|
||||
EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1),
|
||||
"ERROR");
|
||||
EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), "ERROR");
|
||||
|
||||
const char *arg_bad_format[] = {"", "--bar_double="};
|
||||
EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1),
|
||||
"ERROR");
|
||||
EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1), "ERROR");
|
||||
}
|
||||
|
||||
TEST_F(CommandlineflagsTest, ParseStringFlags) {
|
||||
@ -113,8 +110,7 @@ TEST_F(CommandlineflagsTest, ParseStringFlags) {
|
||||
EXPECT_STREQ("def", FLAGS_bar_string.c_str());
|
||||
|
||||
const char *arg_no_value[] = {"", "--bar_string"};
|
||||
EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1),
|
||||
"ERROR");
|
||||
EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), "ERROR");
|
||||
|
||||
FLAGS_bar_string.set_value("bar");
|
||||
const char *arg_empty_string[] = {"", "--bar_string="};
|
||||
@ -145,8 +141,7 @@ TEST_F(CommandlineflagsTest, ParseBoolFlags) {
|
||||
EXPECT_TRUE(FLAGS_bar_bool);
|
||||
|
||||
const char *arg_missing_val[] = {"", "--bar_bool="};
|
||||
EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1),
|
||||
"ERROR");
|
||||
EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1), "ERROR");
|
||||
}
|
||||
|
||||
TEST_F(CommandlineflagsTest, ParseOldFlags) {
|
||||
@ -155,4 +150,4 @@ TEST_F(CommandlineflagsTest, ParseOldFlags) {
|
||||
TestParser(countof(argv), argv);
|
||||
EXPECT_STREQ("text", FLAGS_q.c_str());
|
||||
}
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -45,7 +45,9 @@ public:
|
||||
running_ = false;
|
||||
}
|
||||
}
|
||||
int64_t GetInMs() const { return GetNanos() / 1000000; }
|
||||
int64_t GetInMs() const {
|
||||
return GetNanos() / 1000000;
|
||||
}
|
||||
|
||||
protected:
|
||||
int64_t GetNanos() const {
|
||||
|
@ -12,15 +12,15 @@
|
||||
#include "include_gunit.h"
|
||||
|
||||
#include "ratngs.h"
|
||||
#include "unicharset.h"
|
||||
#include "trie.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <cstdlib> // for system
|
||||
#include <fstream> // for ifstream
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#ifndef SW_TESTING
|
||||
# define wordlist2dawg_prog "wordlist2dawg"
|
||||
@ -59,10 +59,9 @@ class DawgTest : public testing::Test {
|
||||
std::string OutputNameToPath(const std::string &name) const {
|
||||
return file::JoinPath(FLAGS_test_tmpdir, name);
|
||||
}
|
||||
int RunCommand(const std::string& program, const std::string& arg1,
|
||||
const std::string& arg2, const std::string& arg3) const {
|
||||
std::string cmdline =
|
||||
TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
|
||||
int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2,
|
||||
const std::string &arg3) const {
|
||||
std::string cmdline = TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
|
||||
return system(cmdline.c_str());
|
||||
}
|
||||
// Test that we are able to convert a wordlist file (one "word" per line) to
|
||||
@ -76,11 +75,8 @@ class DawgTest : public testing::Test {
|
||||
std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
|
||||
std::string output_wordlist = OutputNameToPath(wordlist_filename);
|
||||
LoadWordlist(orig_wordlist, &orig_words);
|
||||
EXPECT_EQ(
|
||||
RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
|
||||
EXPECT_EQ(
|
||||
RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist),
|
||||
0);
|
||||
EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
|
||||
EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0);
|
||||
LoadWordlist(output_wordlist, &roundtrip_words);
|
||||
EXPECT_EQ(orig_words, roundtrip_words);
|
||||
}
|
||||
@ -93,8 +89,7 @@ TEST_F(DawgTest, TestDawgConversion) {
|
||||
TEST_F(DawgTest, TestMatching) {
|
||||
UNICHARSET unicharset;
|
||||
unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str());
|
||||
tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM,
|
||||
unicharset.size(), 0);
|
||||
tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, unicharset.size(), 0);
|
||||
WERD_CHOICE space_apos(" '", unicharset);
|
||||
trie.add_word_to_dawg(space_apos);
|
||||
|
||||
@ -112,4 +107,4 @@ TEST_F(DawgTest, TestMatching) {
|
||||
EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -25,8 +25,8 @@ class DENORMTest : public testing::Test {
|
||||
public:
|
||||
void TearDown() {}
|
||||
|
||||
void ExpectCorrectTransform(const DENORM& denorm, const TPOINT& src,
|
||||
const TPOINT& result, bool local) {
|
||||
void ExpectCorrectTransform(const DENORM &denorm, const TPOINT &src, const TPOINT &result,
|
||||
bool local) {
|
||||
TPOINT normed;
|
||||
if (local)
|
||||
denorm.LocalNormTransform(src, &normed);
|
||||
@ -48,8 +48,8 @@ class DENORMTest : public testing::Test {
|
||||
// Tests a simple baseline-style normalization.
|
||||
TEST_F(DENORMTest, NoRotations) {
|
||||
DENORM denorm;
|
||||
denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f,
|
||||
0.0f, static_cast<float>(kBlnBaselineOffset));
|
||||
denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f,
|
||||
static_cast<float>(kBlnBaselineOffset));
|
||||
TPOINT pt1(1100, 2000);
|
||||
TPOINT result1(200, kBlnBaselineOffset);
|
||||
ExpectCorrectTransform(denorm, pt1, result1, true);
|
||||
@ -64,8 +64,8 @@ TEST_F(DENORMTest, NoRotations) {
|
||||
TEST_F(DENORMTest, WithRotations) {
|
||||
DENORM denorm;
|
||||
FCOORD rotation90(0.0f, 1.0f);
|
||||
denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f,
|
||||
3.0f, 0.0f, static_cast<float>(kBlnBaselineOffset));
|
||||
denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f,
|
||||
static_cast<float>(kBlnBaselineOffset));
|
||||
|
||||
TPOINT pt1(1100, 2000);
|
||||
TPOINT result1(0, 200 + kBlnBaselineOffset);
|
||||
@ -80,13 +80,13 @@ TEST_F(DENORMTest, WithRotations) {
|
||||
// Tests a simple baseline-style normalization with a second rotation & scale.
|
||||
TEST_F(DENORMTest, Multiple) {
|
||||
DENORM denorm;
|
||||
denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f,
|
||||
0.0f, static_cast<float>(kBlnBaselineOffset));
|
||||
denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, 0.0f,
|
||||
static_cast<float>(kBlnBaselineOffset));
|
||||
|
||||
DENORM denorm2;
|
||||
FCOORD rotation90(0.0f, 1.0f);
|
||||
denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f,
|
||||
0.25f, 0.0f, 0.0f);
|
||||
denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f, 0.25f, 0.0f,
|
||||
0.0f);
|
||||
TPOINT pt1(1050, 2000);
|
||||
TPOINT result1(100, kBlnBaselineOffset);
|
||||
ExpectCorrectTransform(denorm, pt1, result1, true);
|
||||
@ -96,4 +96,4 @@ TEST_F(DENORMTest, Multiple) {
|
||||
ExpectCorrectTransform(denorm2, pt1, result2, false);
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -38,8 +38,8 @@ class TestableEquationDetect : public EquationDetect {
|
||||
}
|
||||
|
||||
// Insert a certain math and digit blobs into part.
|
||||
void AddMathDigitBlobs(const int math_blobs, const int digit_blobs,
|
||||
const int total_blobs, ColPartition* part) {
|
||||
void AddMathDigitBlobs(const int math_blobs, const int digit_blobs, const int total_blobs,
|
||||
ColPartition *part) {
|
||||
CHECK(part != nullptr);
|
||||
CHECK_LE(math_blobs + digit_blobs, total_blobs);
|
||||
int count = 0;
|
||||
@ -75,8 +75,7 @@ class TestableEquationDetect : public EquationDetect {
|
||||
return EstimateTypeForUnichar(unicharset, unicharset.unichar_to_id(val));
|
||||
}
|
||||
|
||||
EquationDetect::IndentType RunIsIndented(ColPartitionGrid* part_grid,
|
||||
ColPartition* part) {
|
||||
EquationDetect::IndentType RunIsIndented(ColPartitionGrid *part_grid, ColPartition *part) {
|
||||
this->part_grid_ = part_grid;
|
||||
return IsIndented(part);
|
||||
}
|
||||
@ -97,13 +96,11 @@ class TestableEquationDetect : public EquationDetect {
|
||||
return CountAlignment(sorted_vec, val);
|
||||
}
|
||||
|
||||
void RunSplitCPHorLite(ColPartition* part,
|
||||
GenericVector<TBOX>* splitted_boxes) {
|
||||
void RunSplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
|
||||
SplitCPHorLite(part, splitted_boxes);
|
||||
}
|
||||
|
||||
void RunSplitCPHor(ColPartition* part,
|
||||
GenericVector<ColPartition*>* parts_splitted) {
|
||||
void RunSplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
|
||||
SplitCPHor(part, parts_splitted);
|
||||
}
|
||||
|
||||
@ -128,8 +125,7 @@ class EquationFinderTest : public testing::Test {
|
||||
tesseract_.reset(new Tesseract());
|
||||
tesseract_->init_tesseract(TESSDATA_DIR, "eng", OEM_TESSERACT_ONLY);
|
||||
tesseract_->set_source_resolution(300);
|
||||
equation_det_.reset(
|
||||
new TestableEquationDetect(TESSDATA_DIR, tesseract_.get()));
|
||||
equation_det_.reset(new TestableEquationDetect(TESSDATA_DIR, tesseract_.get()));
|
||||
equation_det_->SetResolution(300);
|
||||
|
||||
testdata_dir_ = TESTDATA_DIR;
|
||||
@ -145,14 +141,12 @@ class EquationFinderTest : public testing::Test {
|
||||
CHECK(pix != nullptr);
|
||||
CHECK(blocks != nullptr);
|
||||
BLOCK_IT block_it(blocks);
|
||||
BLOCK* block =
|
||||
new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix));
|
||||
BLOCK *block = new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix));
|
||||
block_it.add_to_end(block);
|
||||
}
|
||||
|
||||
// Create col partitions, add into part_grid, and put them into all_parts.
|
||||
void CreateColParts(const int rows, const int cols,
|
||||
ColPartitionGrid* part_grid,
|
||||
void CreateColParts(const int rows, const int cols, ColPartitionGrid *part_grid,
|
||||
std::vector<ColPartition *> *all_parts) {
|
||||
const int kWidth = 10, kHeight = 10;
|
||||
ClearParts(all_parts);
|
||||
@ -160,8 +154,7 @@ class EquationFinderTest : public testing::Test {
|
||||
for (int x = 0; x < cols; ++x) {
|
||||
int left = x * kWidth * 2, bottom = y * kHeight * 2;
|
||||
TBOX box(left, bottom, left + kWidth, bottom + kHeight);
|
||||
ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT,
|
||||
BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part_grid->InsertBBox(true, true, part);
|
||||
all_parts->push_back(part);
|
||||
}
|
||||
@ -203,8 +196,7 @@ TEST_F(EquationFinderTest, IdentifySpecialText) {
|
||||
// Identify special texts from to_blocks.
|
||||
TO_BLOCK_IT to_block_it(&to_blocks);
|
||||
std::map<int, int> stt_count;
|
||||
for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list();
|
||||
to_block_it.forward()) {
|
||||
for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); to_block_it.forward()) {
|
||||
TO_BLOCK *to_block = to_block_it.data();
|
||||
BLOBNBOX_IT blob_it(&(to_block->blobs));
|
||||
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
|
||||
@ -266,42 +258,32 @@ TEST_F(EquationFinderTest, IsIndented) {
|
||||
//
|
||||
// part 5: ********
|
||||
TBOX box1(0, 950, 999, 999);
|
||||
ColPartition* part1 =
|
||||
ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part1 = ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part_grid.InsertBBox(true, true, part1);
|
||||
TBOX box2(300, 920, 900, 940);
|
||||
ColPartition* part2 =
|
||||
ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part2 = ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part_grid.InsertBBox(true, true, part2);
|
||||
TBOX box3(0, 900, 600, 910);
|
||||
ColPartition* part3 =
|
||||
ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part3 = ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part_grid.InsertBBox(true, true, part3);
|
||||
TBOX box4(300, 890, 600, 899);
|
||||
ColPartition* part4 =
|
||||
ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part4 = ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part_grid.InsertBBox(true, true, part4);
|
||||
TBOX box5(300, 500, 900, 510);
|
||||
ColPartition* part5 =
|
||||
ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part5 = ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part_grid.InsertBBox(true, true, part5);
|
||||
|
||||
// Test
|
||||
// part1 should be no indent.
|
||||
EXPECT_EQ(EquationDetect::NO_INDENT,
|
||||
equation_det_->RunIsIndented(&part_grid, part1));
|
||||
EXPECT_EQ(EquationDetect::NO_INDENT, equation_det_->RunIsIndented(&part_grid, part1));
|
||||
// part2 should be left indent in terms of part1.
|
||||
EXPECT_EQ(EquationDetect::LEFT_INDENT,
|
||||
equation_det_->RunIsIndented(&part_grid, part2));
|
||||
EXPECT_EQ(EquationDetect::LEFT_INDENT, equation_det_->RunIsIndented(&part_grid, part2));
|
||||
// part3 should be right indent.
|
||||
EXPECT_EQ(EquationDetect::RIGHT_INDENT,
|
||||
equation_det_->RunIsIndented(&part_grid, part3));
|
||||
EXPECT_EQ(EquationDetect::RIGHT_INDENT, equation_det_->RunIsIndented(&part_grid, part3));
|
||||
// part4 should be both indented.
|
||||
EXPECT_EQ(EquationDetect::BOTH_INDENT,
|
||||
equation_det_->RunIsIndented(&part_grid, part4));
|
||||
EXPECT_EQ(EquationDetect::BOTH_INDENT, equation_det_->RunIsIndented(&part_grid, part4));
|
||||
// part5 should be no indent because it is too far from part1.
|
||||
EXPECT_EQ(EquationDetect::NO_INDENT,
|
||||
equation_det_->RunIsIndented(&part_grid, part5));
|
||||
EXPECT_EQ(EquationDetect::NO_INDENT, equation_det_->RunIsIndented(&part_grid, part5));
|
||||
|
||||
// Release memory.
|
||||
part1->DeleteBoxes();
|
||||
@ -347,14 +329,10 @@ TEST_F(EquationFinderTest, IsNearSmallNeighbor) {
|
||||
|
||||
TEST_F(EquationFinderTest, CheckSeedBlobsCount) {
|
||||
TBOX box(0, 950, 999, 999);
|
||||
ColPartition* part1 =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition* part2 =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition* part3 =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition* part4 =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part1 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part2 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part3 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part4 = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
|
||||
// Part 1: 8 math, 0 digit, 20 total.
|
||||
equation_det_->AddMathDigitBlobs(8, 0, 20, part1);
|
||||
@ -429,20 +407,15 @@ TEST_F(EquationFinderTest, ComputeCPsSuperBBox) {
|
||||
ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000));
|
||||
|
||||
TBOX box1(0, 0, 999, 99);
|
||||
ColPartition* part1 =
|
||||
ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part1 = ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
TBOX box2(0, 100, 499, 199);
|
||||
ColPartition* part2 =
|
||||
ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part2 = ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
TBOX box3(500, 100, 999, 199);
|
||||
ColPartition* part3 =
|
||||
ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part3 = ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
TBOX box4(0, 200, 999, 299);
|
||||
ColPartition* part4 =
|
||||
ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part4 = ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
TBOX box5(0, 900, 999, 999);
|
||||
ColPartition* part5 =
|
||||
ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part5 = ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
|
||||
// Add part1->part3 into part_grid and test.
|
||||
part_grid.InsertBBox(true, true, part1);
|
||||
@ -476,8 +449,7 @@ TEST_F(EquationFinderTest, ComputeCPsSuperBBox) {
|
||||
|
||||
TEST_F(EquationFinderTest, SplitCPHorLite) {
|
||||
TBOX box(0, 0, 999, 99);
|
||||
ColPartition* part =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part->DeleteBoxes();
|
||||
part->set_median_width(10);
|
||||
GenericVector<TBOX> splitted_boxes;
|
||||
@ -511,8 +483,7 @@ TEST_F(EquationFinderTest, SplitCPHorLite) {
|
||||
|
||||
TEST_F(EquationFinderTest, SplitCPHor) {
|
||||
TBOX box(0, 0, 999, 99);
|
||||
ColPartition* part =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part->DeleteBoxes();
|
||||
part->set_median_width(10);
|
||||
GenericVector<ColPartition *> parts_splitted;
|
||||
|
@ -9,7 +9,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory>
|
||||
|
||||
@ -29,7 +28,8 @@ TEST(FileTest, JoinPath) {
|
||||
TEST(OutputBufferTest, WriteString) {
|
||||
const int kMaxBufSize = 128;
|
||||
char buffer[kMaxBufSize];
|
||||
for (int i = 0; i < kMaxBufSize; ++i) buffer[i] = '\0';
|
||||
for (int i = 0; i < kMaxBufSize; ++i)
|
||||
buffer[i] = '\0';
|
||||
FILE *fp = tmpfile();
|
||||
CHECK(fp != nullptr);
|
||||
|
||||
@ -63,4 +63,4 @@ TEST(InputBufferTest, Read) {
|
||||
EXPECT_EQ(" world!", lines[1]);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
|
||||
#include <libgen.h> // for dirname
|
||||
#include <cstdio> // for printf
|
||||
@ -21,8 +21,7 @@ class BitReader {
|
||||
size_t shift;
|
||||
|
||||
public:
|
||||
BitReader(const uint8_t* data, size_t size)
|
||||
: data(data), size(size), shift(0) {}
|
||||
BitReader(const uint8_t *data, size_t size) : data(data), size(size), shift(0) {}
|
||||
|
||||
int Read(void) {
|
||||
if (size == 0) {
|
||||
|
@ -9,7 +9,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "include_gunit.h"
|
||||
|
||||
#include "doubleptr.h"
|
||||
|
@ -33,8 +33,7 @@ class ImagedataTest : public ::testing::Test {
|
||||
ImagedataTest() {}
|
||||
|
||||
// Creates a fake DocumentData, writes it to a file, and returns the filename.
|
||||
std::string MakeFakeDoc(int num_pages, unsigned doc_id,
|
||||
std::vector<std::string>* page_texts) {
|
||||
std::string MakeFakeDoc(int num_pages, unsigned doc_id, std::vector<std::string> *page_texts) {
|
||||
// The size of the fake images that we will use.
|
||||
const int kImageSize = 1048576;
|
||||
// Not using a real image here - just an array of zeros! We are just testing
|
||||
@ -43,18 +42,16 @@ class ImagedataTest : public ::testing::Test {
|
||||
DocumentData write_doc("My document");
|
||||
for (int p = 0; p < num_pages; ++p) {
|
||||
// Make some fake text that is different for each page and save it.
|
||||
page_texts->push_back(
|
||||
absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id));
|
||||
page_texts->push_back(absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id));
|
||||
// Make an imagedata and put it in the document.
|
||||
ImageData* imagedata =
|
||||
ImageData::Build("noname", p, "eng", fake_image.data(),
|
||||
ImageData *imagedata = ImageData::Build("noname", p, "eng", fake_image.data(),
|
||||
fake_image.size(), (*page_texts)[p].c_str(), nullptr);
|
||||
EXPECT_EQ(kImageSize, imagedata->MemoryUsed());
|
||||
write_doc.AddPageToDocument(imagedata);
|
||||
}
|
||||
// Write it to a file.
|
||||
std::string filename = file::JoinPath(
|
||||
FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf"));
|
||||
std::string filename =
|
||||
file::JoinPath(FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf"));
|
||||
EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr));
|
||||
return filename;
|
||||
}
|
||||
@ -76,8 +73,7 @@ TEST_F(ImagedataTest, CachesProperly) {
|
||||
// the pages can still be read.
|
||||
for (int m = 0; kMemoryAllowances[m] > 0; ++m) {
|
||||
DocumentData read_doc("My document");
|
||||
EXPECT_TRUE(
|
||||
read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr));
|
||||
EXPECT_TRUE(read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr));
|
||||
LOG(ERROR) << "Allowance = " << kMemoryAllowances[m];
|
||||
// Read the pages in a specific order.
|
||||
for (int p = 0; kPageReadOrder[p] >= 0; ++p) {
|
||||
@ -86,8 +82,7 @@ TEST_F(ImagedataTest, CachesProperly) {
|
||||
EXPECT_NE(nullptr, imagedata);
|
||||
// EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata);
|
||||
// Check that this is the right page.
|
||||
EXPECT_STREQ(page_texts[page].c_str(),
|
||||
imagedata->transcription().c_str());
|
||||
EXPECT_STREQ(page_texts[page].c_str(), imagedata->transcription().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -119,13 +114,11 @@ TEST_F(ImagedataTest, CachesMultiDocs) {
|
||||
int robin_doc = p % kNumPages.size();
|
||||
int robin_page = p / kNumPages.size() % kNumPages[robin_doc];
|
||||
// Check that this is the right page.
|
||||
EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(),
|
||||
robin_data->transcription().c_str());
|
||||
EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(), robin_data->transcription().c_str());
|
||||
int serial_doc = p / kNumPages[0] % kNumPages.size();
|
||||
int serial_page = p % kNumPages[0] % kNumPages[serial_doc];
|
||||
EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(),
|
||||
serial_data->transcription().c_str());
|
||||
EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(), serial_data->transcription().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -15,14 +15,13 @@
|
||||
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#include "fileio.h" // for tesseract::File
|
||||
#include "log.h" // for LOG
|
||||
#include "gtest/gtest.h"
|
||||
#include "log.h" // for LOG
|
||||
|
||||
const char *FLAGS_test_tmpdir = "./tmp";
|
||||
|
||||
class file : public tesseract::File {
|
||||
public:
|
||||
|
||||
static void MakeTmpdir() {
|
||||
#if defined(_WIN32)
|
||||
_mkdir(FLAGS_test_tmpdir);
|
||||
@ -41,7 +40,8 @@ public:
|
||||
return File::ReadFileToString(filename, out);
|
||||
}
|
||||
|
||||
static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) {
|
||||
static bool SetContents(const std::string &name, const std::string &contents,
|
||||
bool /*is_default*/) {
|
||||
return WriteStringToFile(contents, name);
|
||||
}
|
||||
|
||||
@ -53,8 +53,7 @@ public:
|
||||
return tesseract::File::JoinPath(s1, s2);
|
||||
}
|
||||
|
||||
static std::string JoinPath(const std::string& s1, const std::string& s2,
|
||||
const std::string& s3) {
|
||||
static std::string JoinPath(const std::string &s1, const std::string &s2, const std::string &s3) {
|
||||
return JoinPath(JoinPath(s1, s2), s3);
|
||||
}
|
||||
};
|
||||
|
@ -37,11 +37,13 @@ class IndexMapBiDiTest : public testing::Test {
|
||||
map->Init(kPrimeLimit + 1, false);
|
||||
map->SetMap(2, true);
|
||||
// Set all the odds to true.
|
||||
for (int i = 3; i <= kPrimeLimit; i += 2) map->SetMap(i, true);
|
||||
for (int i = 3; i <= kPrimeLimit; i += 2)
|
||||
map->SetMap(i, true);
|
||||
int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit));
|
||||
for (int f = 3; f <= factor_limit; f += 2) {
|
||||
if (map->SparseToCompact(f) >= 0) {
|
||||
for (int m = 2; m * f <= kPrimeLimit; ++m) map->SetMap(f * m, false);
|
||||
for (int m = 2; m * f <= kPrimeLimit; ++m)
|
||||
map->SetMap(f * m, false);
|
||||
}
|
||||
}
|
||||
map->Setup();
|
||||
@ -114,4 +116,4 @@ TEST_F(IndexMapBiDiTest, ManyToOne) {
|
||||
EXPECT_EQ(1, map.SparseToCompact(11));
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -49,8 +49,7 @@ TEST_F(IntFeatureMapTest, Exhaustive) {
|
||||
IntFeatureMap map;
|
||||
map.Init(space);
|
||||
int total_size = kIntFeatureExtent * kIntFeatureExtent * kIntFeatureExtent;
|
||||
std::unique_ptr<INT_FEATURE_STRUCT[]> features(
|
||||
new INT_FEATURE_STRUCT[total_size]);
|
||||
std::unique_ptr<INT_FEATURE_STRUCT[]> features(new INT_FEATURE_STRUCT[total_size]);
|
||||
// Fill the features with every value.
|
||||
for (int y = 0; y < kIntFeatureExtent; ++y) {
|
||||
for (int x = 0; x < kIntFeatureExtent; ++x) {
|
||||
@ -80,8 +79,7 @@ TEST_F(IntFeatureMapTest, Exhaustive) {
|
||||
int dtheta = kIntFeatureExtent / kThetaBuckets + 1;
|
||||
int bad_offsets = 0;
|
||||
for (int index = 0; index < total_buckets; ++index) {
|
||||
for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps;
|
||||
++dir) {
|
||||
for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps; ++dir) {
|
||||
int offset_index = map.OffsetFeature(index, dir);
|
||||
if (dir == 0) {
|
||||
EXPECT_EQ(index, offset_index);
|
||||
@ -112,11 +110,9 @@ TEST_F(IntFeatureMapTest, Exhaustive) {
|
||||
// Has no effect on index features.
|
||||
EXPECT_EQ(total_size, index_features.size());
|
||||
misses = map.MapIndexedFeatures(index_features, &map_features);
|
||||
int expected_misses = (kIntFeatureExtent / kXBuckets) *
|
||||
(kIntFeatureExtent / kYBuckets) *
|
||||
int expected_misses = (kIntFeatureExtent / kXBuckets) * (kIntFeatureExtent / kYBuckets) *
|
||||
(kIntFeatureExtent / kThetaBuckets + 1);
|
||||
expected_misses += (kIntFeatureExtent / kXBuckets) *
|
||||
(kIntFeatureExtent / kYBuckets + 1) *
|
||||
expected_misses += (kIntFeatureExtent / kXBuckets) * (kIntFeatureExtent / kYBuckets + 1) *
|
||||
(kIntFeatureExtent / kThetaBuckets);
|
||||
EXPECT_EQ(expected_misses, misses);
|
||||
EXPECT_EQ(total_buckets - 2, map_features.size());
|
||||
@ -126,4 +122,4 @@ TEST_F(IntFeatureMapTest, Exhaustive) {
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -15,10 +15,10 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "intsimdmatrix.h"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <gtest/gtest.h>
|
||||
#include <gtest/internal/gtest-port.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include "include_gunit.h"
|
||||
#include "matrix.h"
|
||||
#include "simddetect.h"
|
||||
@ -80,8 +80,8 @@ class IntSimdMatrixTest : public ::testing::Test {
|
||||
matrix.Init(w, shaped_wi, rounded_num_out);
|
||||
scales.reserve(rounded_num_out);
|
||||
if (matrix.matrixDotVectorFunction) {
|
||||
matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0],
|
||||
&scales[0], &u[0], &test_result[0]);
|
||||
matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0], &scales[0], &u[0],
|
||||
&test_result[0]);
|
||||
} else {
|
||||
IntSimdMatrix::MatrixDotVector(w, scales, u.data(), test_result.data());
|
||||
}
|
||||
|
@ -48,22 +48,19 @@ TEST(LangModelTest, AddACharacter) {
|
||||
bool pass_through_recoder = false;
|
||||
std::vector<STRING> words, puncs, numbers;
|
||||
// If these reads fail, we get a warning message and an empty list of words.
|
||||
ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr)
|
||||
.split('\n', &words);
|
||||
ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr).split('\n', &words);
|
||||
EXPECT_GT(words.size(), 0);
|
||||
ReadFile(file::JoinPath(eng_dir, "eng.punc"), nullptr).split('\n', &puncs);
|
||||
EXPECT_GT(puncs.size(), 0);
|
||||
ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr)
|
||||
.split('\n', &numbers);
|
||||
ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr).split('\n', &numbers);
|
||||
EXPECT_GT(numbers.size(), 0);
|
||||
bool lang_is_rtl = false;
|
||||
// Generate the traineddata file.
|
||||
EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
|
||||
lang1, pass_through_recoder, words, puncs,
|
||||
numbers, lang_is_rtl, nullptr, nullptr));
|
||||
EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
|
||||
pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr,
|
||||
nullptr));
|
||||
// Init a trainer with it, and encode kTestString.
|
||||
std::string traineddata1 =
|
||||
file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
|
||||
std::string traineddata1 = file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
|
||||
LSTMTrainer trainer1;
|
||||
trainer1.InitCharSet(traineddata1);
|
||||
std::vector<int> labels1;
|
||||
@ -75,18 +72,15 @@ TEST(LangModelTest, AddACharacter) {
|
||||
// Add a new character to the unicharset and try again.
|
||||
int size_before = unicharset.size();
|
||||
unicharset.unichar_insert("₹");
|
||||
SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
|
||||
&unicharset);
|
||||
SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);
|
||||
EXPECT_EQ(size_before + 1, unicharset.size());
|
||||
// Generate the traineddata file.
|
||||
std::string lang2 = "extended";
|
||||
EXPECT_EQ(EXIT_SUCCESS,
|
||||
CombineLangModel(unicharset, script_dir, version_str, output_dir,
|
||||
lang2, pass_through_recoder, words, puncs, numbers,
|
||||
lang_is_rtl, nullptr, nullptr));
|
||||
EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
|
||||
pass_through_recoder, words, puncs, numbers, lang_is_rtl,
|
||||
nullptr, nullptr));
|
||||
// Init a trainer with it, and encode kTestString.
|
||||
std::string traineddata2 =
|
||||
file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
|
||||
std::string traineddata2 = file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
|
||||
LSTMTrainer trainer2;
|
||||
trainer2.InitCharSet(traineddata2);
|
||||
std::vector<int> labels2;
|
||||
@ -114,8 +108,7 @@ TEST(LangModelTest, AddACharacter) {
|
||||
else
|
||||
labels1_v[i] = labels1[i];
|
||||
}
|
||||
EXPECT_THAT(labels1_v,
|
||||
testing::ElementsAreArray(&labels2[0], labels2.size()));
|
||||
EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));
|
||||
// To make sure we we are not cheating somehow, we can now encode the Rupee
|
||||
// symbol, which we could not do before.
|
||||
EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
|
||||
@ -140,22 +133,19 @@ TEST(LangModelTest, AddACharacterHindi) {
|
||||
bool pass_through_recoder = false;
|
||||
std::vector<STRING> words, puncs, numbers;
|
||||
// If these reads fail, we get a warning message and an empty list of words.
|
||||
ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr)
|
||||
.split('\n', &words);
|
||||
ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr).split('\n', &words);
|
||||
EXPECT_GT(words.size(), 0);
|
||||
ReadFile(file::JoinPath(hin_dir, "hin.punc"), nullptr).split('\n', &puncs);
|
||||
EXPECT_GT(puncs.size(), 0);
|
||||
ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr)
|
||||
.split('\n', &numbers);
|
||||
ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr).split('\n', &numbers);
|
||||
EXPECT_GT(numbers.size(), 0);
|
||||
bool lang_is_rtl = false;
|
||||
// Generate the traineddata file.
|
||||
EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
|
||||
lang1, pass_through_recoder, words, puncs,
|
||||
numbers, lang_is_rtl, nullptr, nullptr));
|
||||
EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang1,
|
||||
pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr,
|
||||
nullptr));
|
||||
// Init a trainer with it, and encode kTestString.
|
||||
std::string traineddata1 =
|
||||
file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
|
||||
std::string traineddata1 = file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
|
||||
LSTMTrainer trainer1;
|
||||
trainer1.InitCharSet(traineddata1);
|
||||
std::vector<int> labels1;
|
||||
@ -167,18 +157,15 @@ TEST(LangModelTest, AddACharacterHindi) {
|
||||
// Add a new character to the unicharset and try again.
|
||||
int size_before = unicharset.size();
|
||||
unicharset.unichar_insert("₹");
|
||||
SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
|
||||
&unicharset);
|
||||
SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset);
|
||||
EXPECT_EQ(size_before + 1, unicharset.size());
|
||||
// Generate the traineddata file.
|
||||
std::string lang2 = "extendedhin";
|
||||
EXPECT_EQ(EXIT_SUCCESS,
|
||||
CombineLangModel(unicharset, script_dir, version_str, output_dir,
|
||||
lang2, pass_through_recoder, words, puncs, numbers,
|
||||
lang_is_rtl, nullptr, nullptr));
|
||||
EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2,
|
||||
pass_through_recoder, words, puncs, numbers, lang_is_rtl,
|
||||
nullptr, nullptr));
|
||||
// Init a trainer with it, and encode kTestString.
|
||||
std::string traineddata2 =
|
||||
file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
|
||||
std::string traineddata2 = file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
|
||||
LSTMTrainer trainer2;
|
||||
trainer2.InitCharSet(traineddata2);
|
||||
std::vector<int> labels2;
|
||||
@ -206,8 +193,7 @@ TEST(LangModelTest, AddACharacterHindi) {
|
||||
else
|
||||
labels1_v[i] = labels1[i];
|
||||
}
|
||||
EXPECT_THAT(labels1_v,
|
||||
testing::ElementsAreArray(&labels2[0], labels2.size()));
|
||||
EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size()));
|
||||
// To make sure we we are not cheating somehow, we can now encode the Rupee
|
||||
// symbol, which we could not do before.
|
||||
EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
|
||||
|
@ -16,13 +16,13 @@
|
||||
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/resultiterator.h>
|
||||
#include "coutln.h"
|
||||
#include "log.h" // for LOG
|
||||
#include "mutableiterator.h"
|
||||
#include "ocrblock.h" // for class BLOCK
|
||||
#include "pageres.h"
|
||||
#include "polyblk.h"
|
||||
#include <tesseract/resultiterator.h>
|
||||
#include "stepblob.h"
|
||||
|
||||
namespace tesseract {
|
||||
@ -47,11 +47,9 @@ static const char* kPolyBlockNames[] = {
|
||||
"" // End marker for testing that sizes match.
|
||||
};
|
||||
|
||||
const char* kStrings8087_054[] = {
|
||||
"dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr};
|
||||
const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT,
|
||||
PT_PULLOUT_IMAGE, PT_CAPTION_TEXT,
|
||||
PT_FLOWING_TEXT};
|
||||
const char *kStrings8087_054[] = {"dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr};
|
||||
const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT, PT_PULLOUT_IMAGE,
|
||||
PT_CAPTION_TEXT, PT_FLOWING_TEXT};
|
||||
|
||||
// The fixture for testing Tesseract.
|
||||
class LayoutTest : public testing::Test {
|
||||
@ -63,8 +61,12 @@ class LayoutTest : public testing::Test {
|
||||
return file::JoinPath(TESSDATA_DIR, "");
|
||||
}
|
||||
|
||||
LayoutTest() { src_pix_ = nullptr; }
|
||||
~LayoutTest() { pixDestroy(&src_pix_); }
|
||||
LayoutTest() {
|
||||
src_pix_ = nullptr;
|
||||
}
|
||||
~LayoutTest() {
|
||||
pixDestroy(&src_pix_);
|
||||
}
|
||||
|
||||
void SetImage(const char *filename, const char *lang) {
|
||||
pixDestroy(&src_pix_);
|
||||
@ -88,15 +90,14 @@ class LayoutTest : public testing::Test {
|
||||
char *block_text = it->GetUTF8Text(tesseract::RIL_BLOCK);
|
||||
if (block_text != nullptr && it->BlockType() == blocks[string_index] &&
|
||||
strstr(block_text, strings[string_index]) != nullptr) {
|
||||
LOG(INFO) << "Found string " << strings[string_index]
|
||||
<< " in block " << block_index
|
||||
LOG(INFO) << "Found string " << strings[string_index] << " in block " << block_index
|
||||
<< " of type " << kPolyBlockNames[blocks[string_index]] << "\n";
|
||||
// Found this one.
|
||||
++string_index;
|
||||
} else if (it->BlockType() == blocks[string_index] &&
|
||||
block_text == nullptr && strings[string_index][0] == '\0') {
|
||||
LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]]
|
||||
<< " at block " << block_index << "\n";
|
||||
} else if (it->BlockType() == blocks[string_index] && block_text == nullptr &&
|
||||
strings[string_index][0] == '\0') {
|
||||
LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]] << " at block "
|
||||
<< block_index << "\n";
|
||||
// Found this one.
|
||||
++string_index;
|
||||
} else {
|
||||
@ -104,7 +105,8 @@ class LayoutTest : public testing::Test {
|
||||
}
|
||||
delete[] block_text;
|
||||
++block_index;
|
||||
if (strings[string_index] == nullptr) break;
|
||||
if (strings[string_index] == nullptr)
|
||||
break;
|
||||
} while (it->Next(tesseract::RIL_BLOCK));
|
||||
EXPECT_TRUE(strings[string_index] == nullptr);
|
||||
}
|
||||
@ -122,8 +124,7 @@ class LayoutTest : public testing::Test {
|
||||
do {
|
||||
int left, top, right, bottom;
|
||||
if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) &&
|
||||
PTIsTextType(it->BlockType()) && right - left > 800 &&
|
||||
bottom - top > 200) {
|
||||
PTIsTextType(it->BlockType()) && right - left > 800 && bottom - top > 200) {
|
||||
if (prev_right > prev_left) {
|
||||
if (std::min(right, prev_right) > std::max(left, prev_left)) {
|
||||
EXPECT_GE(top, prev_bottom) << "Overlapping block should be below";
|
||||
@ -150,8 +151,7 @@ class LayoutTest : public testing::Test {
|
||||
do {
|
||||
int left, top, right, bottom;
|
||||
if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) &&
|
||||
PTIsTextType(it->BlockType()) && right - left > 800 &&
|
||||
bottom - top > 200) {
|
||||
PTIsTextType(it->BlockType()) && right - left > 800 && bottom - top > 200) {
|
||||
const PAGE_RES_IT *pr_it = it->PageResIt();
|
||||
POLY_BLOCK *pb = pr_it->block()->block->pdblk.poly_block();
|
||||
CHECK(pb != nullptr);
|
||||
@ -171,8 +171,7 @@ class LayoutTest : public testing::Test {
|
||||
for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
|
||||
C_OUTLINE *ol = ol_it.data();
|
||||
TBOX box = ol->bounding_box();
|
||||
ICOORD middle((box.left() + box.right()) / 2,
|
||||
(box.top() + box.bottom()) / 2);
|
||||
ICOORD middle((box.left() + box.right()) / 2, (box.top() + box.bottom()) / 2);
|
||||
EXPECT_EQ(winding_target, pb->winding_number(middle));
|
||||
}
|
||||
}
|
||||
@ -231,4 +230,4 @@ TEST_F(LayoutTest, HebrewOrderingAndSkew) {
|
||||
delete it;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,10 +9,10 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "ligature_table.h"
|
||||
#include "commandlineflags.h"
|
||||
#include "fileio.h"
|
||||
#include "include_gunit.h"
|
||||
#include "ligature_table.h"
|
||||
#include "pango_font_info.h"
|
||||
|
||||
namespace tesseract {
|
||||
@ -55,8 +55,7 @@ TEST_F(LigatureTableTest, DoesFillLigatureTables) {
|
||||
}
|
||||
|
||||
TEST_F(LigatureTableTest, DoesAddLigatures) {
|
||||
EXPECT_STREQ(kEngLigatureText,
|
||||
lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str());
|
||||
EXPECT_STREQ(kEngLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str());
|
||||
}
|
||||
|
||||
TEST_F(LigatureTableTest, DoesAddLigaturesWithSupportedFont) {
|
||||
@ -71,13 +70,11 @@ printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
|
||||
TEST_F(LigatureTableTest, DoesNotAddLigaturesWithUnsupportedFont) {
|
||||
PangoFontInfo font;
|
||||
EXPECT_TRUE(font.ParseFontDescriptionName("Lohit Hindi"));
|
||||
EXPECT_STREQ(kEngNonLigatureText,
|
||||
lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
|
||||
EXPECT_STREQ(kEngNonLigatureText, lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
|
||||
}
|
||||
|
||||
TEST_F(LigatureTableTest, DoesRemoveLigatures) {
|
||||
EXPECT_STREQ(kEngNonLigatureText,
|
||||
lig_table_->RemoveLigatures(kEngLigatureText).c_str());
|
||||
EXPECT_STREQ(kEngNonLigatureText, lig_table_->RemoveLigatures(kEngLigatureText).c_str());
|
||||
}
|
||||
|
||||
TEST_F(LigatureTableTest, TestCustomLigatures) {
|
||||
@ -86,12 +83,9 @@ TEST_F(LigatureTableTest, TestCustomLigatures) {
|
||||
"\uE007nce", "aſleep", "a\uE008eep", "neceſſary", "nece\uE009ary",
|
||||
};
|
||||
for (size_t i = 0; i < countof(kTestCases); i += 2) {
|
||||
EXPECT_STREQ(kTestCases[i + 1],
|
||||
lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
|
||||
EXPECT_STREQ(kTestCases[i],
|
||||
lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str());
|
||||
EXPECT_STREQ(kTestCases[i],
|
||||
lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
|
||||
EXPECT_STREQ(kTestCases[i + 1], lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
|
||||
EXPECT_STREQ(kTestCases[i], lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str());
|
||||
EXPECT_STREQ(kTestCases[i], lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
@ -102,10 +96,8 @@ TEST_F(LigatureTableTest, TestRemovesCustomLigatures) {
|
||||
"fiction",
|
||||
};
|
||||
for (size_t i = 0; i < countof(kTestCases); i += 3) {
|
||||
EXPECT_STREQ(kTestCases[i + 1],
|
||||
lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
|
||||
EXPECT_STREQ(kTestCases[i + 2],
|
||||
lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
|
||||
EXPECT_STREQ(kTestCases[i + 1], lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
|
||||
EXPECT_STREQ(kTestCases[i + 2], lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -24,8 +24,8 @@ class LLSQTest : public testing::Test {
|
||||
public:
|
||||
void TearDown() {}
|
||||
|
||||
void ExpectCorrectLine(const LLSQ& llsq, double m, double c, double rms,
|
||||
double pearson, double tolerance) {
|
||||
void ExpectCorrectLine(const LLSQ &llsq, double m, double c, double rms, double pearson,
|
||||
double tolerance) {
|
||||
EXPECT_NEAR(m, llsq.m(), tolerance);
|
||||
EXPECT_NEAR(c, llsq.c(llsq.m()), tolerance);
|
||||
EXPECT_NEAR(rms, llsq.rms(llsq.m(), llsq.c(llsq.m())), tolerance);
|
||||
@ -53,8 +53,8 @@ class LLSQTest : public testing::Test {
|
||||
expected_answer = sqrt(expected_answer);
|
||||
EXPECT_NEAR(expected_answer, llsq.rms_orth(orth), 0.0001);
|
||||
}
|
||||
void ExpectCorrectVector(const LLSQ& llsq, FCOORD correct_mean_pt,
|
||||
FCOORD correct_vector, float tolerance) {
|
||||
void ExpectCorrectVector(const LLSQ &llsq, FCOORD correct_mean_pt, FCOORD correct_vector,
|
||||
float tolerance) {
|
||||
FCOORD mean_pt = llsq.mean_point();
|
||||
FCOORD vector = llsq.vector_fit();
|
||||
EXPECT_NEAR(correct_mean_pt.x(), mean_pt.x(), tolerance);
|
||||
@ -71,8 +71,7 @@ TEST_F(LLSQTest, BasicLines) {
|
||||
llsq.add(2.0, 2.0);
|
||||
ExpectCorrectLine(llsq, 1.0, 0.0, 0.0, 1.0, 1e-6);
|
||||
float half_root_2 = sqrt(2.0) / 2.0f;
|
||||
ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f),
|
||||
FCOORD(half_root_2, half_root_2), 1e-6);
|
||||
ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f), FCOORD(half_root_2, half_root_2), 1e-6);
|
||||
llsq.remove(2.0, 2.0);
|
||||
llsq.add(1.0, 2.0);
|
||||
llsq.add(10.0, 1.0);
|
||||
@ -115,4 +114,4 @@ TEST_F(LLSQTest, RmsOrthWorksAsIntended) {
|
||||
VerifyRmsOrth(pts, FCOORD(2, 1));
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -30,8 +30,7 @@ class ListTest : public ::testing::Test {
|
||||
|
||||
class Elst : public ELIST_LINK {
|
||||
public:
|
||||
Elst(unsigned n) : value(n) {
|
||||
}
|
||||
Elst(unsigned n) : value(n) {}
|
||||
unsigned value;
|
||||
};
|
||||
|
||||
|
@ -16,236 +16,556 @@
|
||||
// limitations under the License.
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <time.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <time.h>
|
||||
#include <memory> // std::unique_ptr
|
||||
#include "include_gunit.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class QuickTest : public testing::Test {
|
||||
protected:
|
||||
virtual void SetUp() { start_time_ = time(nullptr); }
|
||||
virtual void SetUp() {
|
||||
start_time_ = time(nullptr);
|
||||
}
|
||||
virtual void TearDown() {
|
||||
const time_t end_time = time(nullptr);
|
||||
EXPECT_TRUE(end_time - start_time_ <= 25)
|
||||
<< "The test took too long - "
|
||||
<< ::testing::PrintToString(end_time - start_time_);
|
||||
<< "The test took too long - " << ::testing::PrintToString(end_time - start_time_);
|
||||
}
|
||||
time_t start_time_;
|
||||
};
|
||||
|
||||
void LangLoader(const char *lang, const char *tessdatadir) {
|
||||
std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang))
|
||||
<< "Could not initialize tesseract for $lang.";
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract for $lang.";
|
||||
api->End();
|
||||
}
|
||||
|
||||
// For all languages
|
||||
|
||||
class LoadLanguage : public QuickTest,
|
||||
public ::testing::WithParamInterface<const char*> {};
|
||||
class LoadLanguage : public QuickTest, public ::testing::WithParamInterface<const char *> {};
|
||||
|
||||
TEST_P(LoadLanguage, afr) { LangLoader("afr", GetParam()); }
|
||||
TEST_P(LoadLanguage, amh) { LangLoader("amh", GetParam()); }
|
||||
TEST_P(LoadLanguage, ara) { LangLoader("ara", GetParam()); }
|
||||
TEST_P(LoadLanguage, asm) { LangLoader("asm", GetParam()); }
|
||||
TEST_P(LoadLanguage, aze) { LangLoader("aze", GetParam()); }
|
||||
TEST_P(LoadLanguage, aze_cyrl) { LangLoader("aze_cyrl", GetParam()); }
|
||||
TEST_P(LoadLanguage, bel) { LangLoader("bel", GetParam()); }
|
||||
TEST_P(LoadLanguage, ben) { LangLoader("ben", GetParam()); }
|
||||
TEST_P(LoadLanguage, bod) { LangLoader("bod", GetParam()); }
|
||||
TEST_P(LoadLanguage, bos) { LangLoader("bos", GetParam()); }
|
||||
TEST_P(LoadLanguage, bre) { LangLoader("bre", GetParam()); }
|
||||
TEST_P(LoadLanguage, bul) { LangLoader("bul", GetParam()); }
|
||||
TEST_P(LoadLanguage, cat) { LangLoader("cat", GetParam()); }
|
||||
TEST_P(LoadLanguage, ceb) { LangLoader("ceb", GetParam()); }
|
||||
TEST_P(LoadLanguage, ces) { LangLoader("ces", GetParam()); }
|
||||
TEST_P(LoadLanguage, chi_sim) { LangLoader("chi_sim", GetParam()); }
|
||||
TEST_P(LoadLanguage, chi_sim_vert) { LangLoader("chi_sim_vert", GetParam()); }
|
||||
TEST_P(LoadLanguage, chi_tra) { LangLoader("chi_tra", GetParam()); }
|
||||
TEST_P(LoadLanguage, chi_tra_vert) { LangLoader("chi_tra_vert", GetParam()); }
|
||||
TEST_P(LoadLanguage, chr) { LangLoader("chr", GetParam()); }
|
||||
TEST_P(LoadLanguage, cos) { LangLoader("cos", GetParam()); }
|
||||
TEST_P(LoadLanguage, cym) { LangLoader("cym", GetParam()); }
|
||||
TEST_P(LoadLanguage, dan) { LangLoader("dan", GetParam()); }
|
||||
TEST_P(LoadLanguage, deu) { LangLoader("deu", GetParam()); }
|
||||
TEST_P(LoadLanguage, div) { LangLoader("div", GetParam()); }
|
||||
TEST_P(LoadLanguage, dzo) { LangLoader("dzo", GetParam()); }
|
||||
TEST_P(LoadLanguage, ell) { LangLoader("ell", GetParam()); }
|
||||
TEST_P(LoadLanguage, eng) { LangLoader("eng", GetParam()); }
|
||||
TEST_P(LoadLanguage, enm) { LangLoader("enm", GetParam()); }
|
||||
TEST_P(LoadLanguage, epo) { LangLoader("epo", GetParam()); }
|
||||
TEST_P(LoadLanguage, est) { LangLoader("est", GetParam()); }
|
||||
TEST_P(LoadLanguage, eus) { LangLoader("eus", GetParam()); }
|
||||
TEST_P(LoadLanguage, fao) { LangLoader("fao", GetParam()); }
|
||||
TEST_P(LoadLanguage, fas) { LangLoader("fas", GetParam()); }
|
||||
TEST_P(LoadLanguage, fil) { LangLoader("fil", GetParam()); }
|
||||
TEST_P(LoadLanguage, fin) { LangLoader("fin", GetParam()); }
|
||||
TEST_P(LoadLanguage, fra) { LangLoader("fra", GetParam()); }
|
||||
TEST_P(LoadLanguage, frk) { LangLoader("frk", GetParam()); }
|
||||
TEST_P(LoadLanguage, frm) { LangLoader("frm", GetParam()); }
|
||||
TEST_P(LoadLanguage, fry) { LangLoader("fry", GetParam()); }
|
||||
TEST_P(LoadLanguage, gla) { LangLoader("gla", GetParam()); }
|
||||
TEST_P(LoadLanguage, gle) { LangLoader("gle", GetParam()); }
|
||||
TEST_P(LoadLanguage, glg) { LangLoader("glg", GetParam()); }
|
||||
TEST_P(LoadLanguage, grc) { LangLoader("grc", GetParam()); }
|
||||
TEST_P(LoadLanguage, guj) { LangLoader("guj", GetParam()); }
|
||||
TEST_P(LoadLanguage, hat) { LangLoader("hat", GetParam()); }
|
||||
TEST_P(LoadLanguage, heb) { LangLoader("heb", GetParam()); }
|
||||
TEST_P(LoadLanguage, hin) { LangLoader("hin", GetParam()); }
|
||||
TEST_P(LoadLanguage, hrv) { LangLoader("hrv", GetParam()); }
|
||||
TEST_P(LoadLanguage, hun) { LangLoader("hun", GetParam()); }
|
||||
TEST_P(LoadLanguage, hye) { LangLoader("hye", GetParam()); }
|
||||
TEST_P(LoadLanguage, iku) { LangLoader("iku", GetParam()); }
|
||||
TEST_P(LoadLanguage, ind) { LangLoader("ind", GetParam()); }
|
||||
TEST_P(LoadLanguage, isl) { LangLoader("isl", GetParam()); }
|
||||
TEST_P(LoadLanguage, ita) { LangLoader("ita", GetParam()); }
|
||||
TEST_P(LoadLanguage, ita_old) { LangLoader("ita_old", GetParam()); }
|
||||
TEST_P(LoadLanguage, jav) { LangLoader("jav", GetParam()); }
|
||||
TEST_P(LoadLanguage, jpn) { LangLoader("jpn", GetParam()); }
|
||||
TEST_P(LoadLanguage, jpn_vert) { LangLoader("jpn_vert", GetParam()); }
|
||||
TEST_P(LoadLanguage, kan) { LangLoader("kan", GetParam()); }
|
||||
TEST_P(LoadLanguage, kat) { LangLoader("kat", GetParam()); }
|
||||
TEST_P(LoadLanguage, kat_old) { LangLoader("kat_old", GetParam()); }
|
||||
TEST_P(LoadLanguage, kaz) { LangLoader("kaz", GetParam()); }
|
||||
TEST_P(LoadLanguage, khm) { LangLoader("khm", GetParam()); }
|
||||
TEST_P(LoadLanguage, kir) { LangLoader("kir", GetParam()); }
|
||||
TEST_P(LoadLanguage, afr) {
|
||||
LangLoader("afr", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, amh) {
|
||||
LangLoader("amh", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ara) {
|
||||
LangLoader("ara", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, asm) {
|
||||
LangLoader("asm", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, aze) {
|
||||
LangLoader("aze", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, aze_cyrl) {
|
||||
LangLoader("aze_cyrl", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, bel) {
|
||||
LangLoader("bel", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ben) {
|
||||
LangLoader("ben", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, bod) {
|
||||
LangLoader("bod", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, bos) {
|
||||
LangLoader("bos", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, bre) {
|
||||
LangLoader("bre", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, bul) {
|
||||
LangLoader("bul", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, cat) {
|
||||
LangLoader("cat", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ceb) {
|
||||
LangLoader("ceb", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ces) {
|
||||
LangLoader("ces", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, chi_sim) {
|
||||
LangLoader("chi_sim", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, chi_sim_vert) {
|
||||
LangLoader("chi_sim_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, chi_tra) {
|
||||
LangLoader("chi_tra", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, chi_tra_vert) {
|
||||
LangLoader("chi_tra_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, chr) {
|
||||
LangLoader("chr", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, cos) {
|
||||
LangLoader("cos", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, cym) {
|
||||
LangLoader("cym", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, dan) {
|
||||
LangLoader("dan", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, deu) {
|
||||
LangLoader("deu", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, div) {
|
||||
LangLoader("div", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, dzo) {
|
||||
LangLoader("dzo", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ell) {
|
||||
LangLoader("ell", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, eng) {
|
||||
LangLoader("eng", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, enm) {
|
||||
LangLoader("enm", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, epo) {
|
||||
LangLoader("epo", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, est) {
|
||||
LangLoader("est", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, eus) {
|
||||
LangLoader("eus", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, fao) {
|
||||
LangLoader("fao", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, fas) {
|
||||
LangLoader("fas", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, fil) {
|
||||
LangLoader("fil", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, fin) {
|
||||
LangLoader("fin", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, fra) {
|
||||
LangLoader("fra", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, frk) {
|
||||
LangLoader("frk", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, frm) {
|
||||
LangLoader("frm", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, fry) {
|
||||
LangLoader("fry", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, gla) {
|
||||
LangLoader("gla", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, gle) {
|
||||
LangLoader("gle", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, glg) {
|
||||
LangLoader("glg", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, grc) {
|
||||
LangLoader("grc", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, guj) {
|
||||
LangLoader("guj", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, hat) {
|
||||
LangLoader("hat", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, heb) {
|
||||
LangLoader("heb", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, hin) {
|
||||
LangLoader("hin", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, hrv) {
|
||||
LangLoader("hrv", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, hun) {
|
||||
LangLoader("hun", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, hye) {
|
||||
LangLoader("hye", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, iku) {
|
||||
LangLoader("iku", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ind) {
|
||||
LangLoader("ind", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, isl) {
|
||||
LangLoader("isl", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ita) {
|
||||
LangLoader("ita", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ita_old) {
|
||||
LangLoader("ita_old", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, jav) {
|
||||
LangLoader("jav", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, jpn) {
|
||||
LangLoader("jpn", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, jpn_vert) {
|
||||
LangLoader("jpn_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, kan) {
|
||||
LangLoader("kan", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, kat) {
|
||||
LangLoader("kat", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, kat_old) {
|
||||
LangLoader("kat_old", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, kaz) {
|
||||
LangLoader("kaz", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, khm) {
|
||||
LangLoader("khm", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, kir) {
|
||||
LangLoader("kir", GetParam());
|
||||
}
|
||||
// TEST_P(LoadLanguage, kmr) {LangLoader("kmr" , GetParam());}
|
||||
TEST_P(LoadLanguage, kor) { LangLoader("kor", GetParam()); }
|
||||
TEST_P(LoadLanguage, kor_vert) { LangLoader("kor_vert", GetParam()); }
|
||||
TEST_P(LoadLanguage, lao) { LangLoader("lao", GetParam()); }
|
||||
TEST_P(LoadLanguage, lat) { LangLoader("lat", GetParam()); }
|
||||
TEST_P(LoadLanguage, lav) { LangLoader("lav", GetParam()); }
|
||||
TEST_P(LoadLanguage, lit) { LangLoader("lit", GetParam()); }
|
||||
TEST_P(LoadLanguage, ltz) { LangLoader("ltz", GetParam()); }
|
||||
TEST_P(LoadLanguage, mal) { LangLoader("mal", GetParam()); }
|
||||
TEST_P(LoadLanguage, mar) { LangLoader("mar", GetParam()); }
|
||||
TEST_P(LoadLanguage, mkd) { LangLoader("mkd", GetParam()); }
|
||||
TEST_P(LoadLanguage, mlt) { LangLoader("mlt", GetParam()); }
|
||||
TEST_P(LoadLanguage, mon) { LangLoader("mon", GetParam()); }
|
||||
TEST_P(LoadLanguage, mri) { LangLoader("mri", GetParam()); }
|
||||
TEST_P(LoadLanguage, msa) { LangLoader("msa", GetParam()); }
|
||||
TEST_P(LoadLanguage, mya) { LangLoader("mya", GetParam()); }
|
||||
TEST_P(LoadLanguage, nep) { LangLoader("nep", GetParam()); }
|
||||
TEST_P(LoadLanguage, nld) { LangLoader("nld", GetParam()); }
|
||||
TEST_P(LoadLanguage, nor) { LangLoader("nor", GetParam()); }
|
||||
TEST_P(LoadLanguage, oci) { LangLoader("oci", GetParam()); }
|
||||
TEST_P(LoadLanguage, ori) { LangLoader("ori", GetParam()); }
|
||||
TEST_P(LoadLanguage, osd) { LangLoader("osd", GetParam()); }
|
||||
TEST_P(LoadLanguage, pan) { LangLoader("pan", GetParam()); }
|
||||
TEST_P(LoadLanguage, pol) { LangLoader("pol", GetParam()); }
|
||||
TEST_P(LoadLanguage, por) { LangLoader("por", GetParam()); }
|
||||
TEST_P(LoadLanguage, pus) { LangLoader("pus", GetParam()); }
|
||||
TEST_P(LoadLanguage, que) { LangLoader("que", GetParam()); }
|
||||
TEST_P(LoadLanguage, ron) { LangLoader("ron", GetParam()); }
|
||||
TEST_P(LoadLanguage, rus) { LangLoader("rus", GetParam()); }
|
||||
TEST_P(LoadLanguage, san) { LangLoader("san", GetParam()); }
|
||||
TEST_P(LoadLanguage, sin) { LangLoader("sin", GetParam()); }
|
||||
TEST_P(LoadLanguage, slk) { LangLoader("slk", GetParam()); }
|
||||
TEST_P(LoadLanguage, slv) { LangLoader("slv", GetParam()); }
|
||||
TEST_P(LoadLanguage, snd) { LangLoader("snd", GetParam()); }
|
||||
TEST_P(LoadLanguage, spa) { LangLoader("spa", GetParam()); }
|
||||
TEST_P(LoadLanguage, spa_old) { LangLoader("spa_old", GetParam()); }
|
||||
TEST_P(LoadLanguage, sqi) { LangLoader("sqi", GetParam()); }
|
||||
TEST_P(LoadLanguage, srp) { LangLoader("srp", GetParam()); }
|
||||
TEST_P(LoadLanguage, srp_latn) { LangLoader("srp_latn", GetParam()); }
|
||||
TEST_P(LoadLanguage, sun) { LangLoader("sun", GetParam()); }
|
||||
TEST_P(LoadLanguage, swa) { LangLoader("swa", GetParam()); }
|
||||
TEST_P(LoadLanguage, swe) { LangLoader("swe", GetParam()); }
|
||||
TEST_P(LoadLanguage, syr) { LangLoader("syr", GetParam()); }
|
||||
TEST_P(LoadLanguage, tam) { LangLoader("tam", GetParam()); }
|
||||
TEST_P(LoadLanguage, tat) { LangLoader("tat", GetParam()); }
|
||||
TEST_P(LoadLanguage, tel) { LangLoader("tel", GetParam()); }
|
||||
TEST_P(LoadLanguage, tgk) { LangLoader("tgk", GetParam()); }
|
||||
TEST_P(LoadLanguage, tha) { LangLoader("tha", GetParam()); }
|
||||
TEST_P(LoadLanguage, tir) { LangLoader("tir", GetParam()); }
|
||||
TEST_P(LoadLanguage, ton) { LangLoader("ton", GetParam()); }
|
||||
TEST_P(LoadLanguage, tur) { LangLoader("tur", GetParam()); }
|
||||
TEST_P(LoadLanguage, uig) { LangLoader("uig", GetParam()); }
|
||||
TEST_P(LoadLanguage, ukr) { LangLoader("ukr", GetParam()); }
|
||||
TEST_P(LoadLanguage, urd) { LangLoader("urd", GetParam()); }
|
||||
TEST_P(LoadLanguage, uzb) { LangLoader("uzb", GetParam()); }
|
||||
TEST_P(LoadLanguage, uzb_cyrl) { LangLoader("uzb_cyrl", GetParam()); }
|
||||
TEST_P(LoadLanguage, vie) { LangLoader("vie", GetParam()); }
|
||||
TEST_P(LoadLanguage, yid) { LangLoader("yid", GetParam()); }
|
||||
TEST_P(LoadLanguage, yor) { LangLoader("yor", GetParam()); }
|
||||
TEST_P(LoadLanguage, kor) {
|
||||
LangLoader("kor", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, kor_vert) {
|
||||
LangLoader("kor_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, lao) {
|
||||
LangLoader("lao", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, lat) {
|
||||
LangLoader("lat", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, lav) {
|
||||
LangLoader("lav", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, lit) {
|
||||
LangLoader("lit", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ltz) {
|
||||
LangLoader("ltz", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mal) {
|
||||
LangLoader("mal", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mar) {
|
||||
LangLoader("mar", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mkd) {
|
||||
LangLoader("mkd", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mlt) {
|
||||
LangLoader("mlt", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mon) {
|
||||
LangLoader("mon", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mri) {
|
||||
LangLoader("mri", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, msa) {
|
||||
LangLoader("msa", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, mya) {
|
||||
LangLoader("mya", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, nep) {
|
||||
LangLoader("nep", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, nld) {
|
||||
LangLoader("nld", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, nor) {
|
||||
LangLoader("nor", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, oci) {
|
||||
LangLoader("oci", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ori) {
|
||||
LangLoader("ori", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, osd) {
|
||||
LangLoader("osd", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, pan) {
|
||||
LangLoader("pan", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, pol) {
|
||||
LangLoader("pol", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, por) {
|
||||
LangLoader("por", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, pus) {
|
||||
LangLoader("pus", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, que) {
|
||||
LangLoader("que", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ron) {
|
||||
LangLoader("ron", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, rus) {
|
||||
LangLoader("rus", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, san) {
|
||||
LangLoader("san", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, sin) {
|
||||
LangLoader("sin", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, slk) {
|
||||
LangLoader("slk", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, slv) {
|
||||
LangLoader("slv", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, snd) {
|
||||
LangLoader("snd", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, spa) {
|
||||
LangLoader("spa", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, spa_old) {
|
||||
LangLoader("spa_old", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, sqi) {
|
||||
LangLoader("sqi", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, srp) {
|
||||
LangLoader("srp", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, srp_latn) {
|
||||
LangLoader("srp_latn", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, sun) {
|
||||
LangLoader("sun", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, swa) {
|
||||
LangLoader("swa", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, swe) {
|
||||
LangLoader("swe", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, syr) {
|
||||
LangLoader("syr", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tam) {
|
||||
LangLoader("tam", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tat) {
|
||||
LangLoader("tat", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tel) {
|
||||
LangLoader("tel", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tgk) {
|
||||
LangLoader("tgk", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tha) {
|
||||
LangLoader("tha", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tir) {
|
||||
LangLoader("tir", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ton) {
|
||||
LangLoader("ton", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, tur) {
|
||||
LangLoader("tur", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, uig) {
|
||||
LangLoader("uig", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, ukr) {
|
||||
LangLoader("ukr", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, urd) {
|
||||
LangLoader("urd", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, uzb) {
|
||||
LangLoader("uzb", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, uzb_cyrl) {
|
||||
LangLoader("uzb_cyrl", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, vie) {
|
||||
LangLoader("vie", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, yid) {
|
||||
LangLoader("yid", GetParam());
|
||||
}
|
||||
TEST_P(LoadLanguage, yor) {
|
||||
LangLoader("yor", GetParam());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadLanguage,
|
||||
::testing::Values(TESSDATA_DIR "_fast"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadLanguage,
|
||||
::testing::Values(TESSDATA_DIR "_best"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage,
|
||||
::testing::Values(TESSDATA_DIR));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage, ::testing::Values(TESSDATA_DIR));
|
||||
|
||||
// For all scripts
|
||||
|
||||
class LoadScript : public QuickTest,
|
||||
public ::testing::WithParamInterface<const char*> {};
|
||||
class LoadScript : public QuickTest, public ::testing::WithParamInterface<const char *> {};
|
||||
|
||||
TEST_P(LoadScript, Arabic) { LangLoader("script/Arabic", GetParam()); }
|
||||
TEST_P(LoadScript, Armenian) { LangLoader("script/Armenian", GetParam()); }
|
||||
TEST_P(LoadScript, Bengali) { LangLoader("script/Bengali", GetParam()); }
|
||||
TEST_P(LoadScript, Arabic) {
|
||||
LangLoader("script/Arabic", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Armenian) {
|
||||
LangLoader("script/Armenian", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Bengali) {
|
||||
LangLoader("script/Bengali", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Canadian_Aboriginal) {
|
||||
LangLoader("script/Canadian_Aboriginal", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Cherokee) { LangLoader("script/Cherokee", GetParam()); }
|
||||
TEST_P(LoadScript, Cyrillic) { LangLoader("script/Cyrillic", GetParam()); }
|
||||
TEST_P(LoadScript, Devanagari) { LangLoader("script/Devanagari", GetParam()); }
|
||||
TEST_P(LoadScript, Ethiopic) { LangLoader("script/Ethiopic", GetParam()); }
|
||||
TEST_P(LoadScript, Fraktur) { LangLoader("script/Fraktur", GetParam()); }
|
||||
TEST_P(LoadScript, Georgian) { LangLoader("script/Georgian", GetParam()); }
|
||||
TEST_P(LoadScript, Greek) { LangLoader("script/Greek", GetParam()); }
|
||||
TEST_P(LoadScript, Gujarati) { LangLoader("script/Gujarati", GetParam()); }
|
||||
TEST_P(LoadScript, Gurmukhi) { LangLoader("script/Gurmukhi", GetParam()); }
|
||||
TEST_P(LoadScript, HanS) { LangLoader("script/HanS", GetParam()); }
|
||||
TEST_P(LoadScript, HanS_vert) { LangLoader("script/HanS_vert", GetParam()); }
|
||||
TEST_P(LoadScript, HanT) { LangLoader("script/HanT", GetParam()); }
|
||||
TEST_P(LoadScript, HanT_vert) { LangLoader("script/HanT_vert", GetParam()); }
|
||||
TEST_P(LoadScript, Hangul) { LangLoader("script/Hangul", GetParam()); }
|
||||
TEST_P(LoadScript, Cherokee) {
|
||||
LangLoader("script/Cherokee", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Cyrillic) {
|
||||
LangLoader("script/Cyrillic", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Devanagari) {
|
||||
LangLoader("script/Devanagari", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Ethiopic) {
|
||||
LangLoader("script/Ethiopic", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Fraktur) {
|
||||
LangLoader("script/Fraktur", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Georgian) {
|
||||
LangLoader("script/Georgian", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Greek) {
|
||||
LangLoader("script/Greek", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Gujarati) {
|
||||
LangLoader("script/Gujarati", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Gurmukhi) {
|
||||
LangLoader("script/Gurmukhi", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, HanS) {
|
||||
LangLoader("script/HanS", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, HanS_vert) {
|
||||
LangLoader("script/HanS_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, HanT) {
|
||||
LangLoader("script/HanT", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, HanT_vert) {
|
||||
LangLoader("script/HanT_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Hangul) {
|
||||
LangLoader("script/Hangul", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Hangul_vert) {
|
||||
LangLoader("script/Hangul_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Hebrew) { LangLoader("script/Hebrew", GetParam()); }
|
||||
TEST_P(LoadScript, Japanese) { LangLoader("script/Japanese", GetParam()); }
|
||||
TEST_P(LoadScript, Hebrew) {
|
||||
LangLoader("script/Hebrew", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Japanese) {
|
||||
LangLoader("script/Japanese", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Japanese_vert) {
|
||||
LangLoader("script/Japanese_vert", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Kannada) { LangLoader("script/Kannada", GetParam()); }
|
||||
TEST_P(LoadScript, Khmer) { LangLoader("script/Khmer", GetParam()); }
|
||||
TEST_P(LoadScript, Lao) { LangLoader("script/Lao", GetParam()); }
|
||||
TEST_P(LoadScript, Latin) { LangLoader("script/Latin", GetParam()); }
|
||||
TEST_P(LoadScript, Malayalam) { LangLoader("script/Malayalam", GetParam()); }
|
||||
TEST_P(LoadScript, Myanmar) { LangLoader("script/Myanmar", GetParam()); }
|
||||
TEST_P(LoadScript, Oriya) { LangLoader("script/Oriya", GetParam()); }
|
||||
TEST_P(LoadScript, Sinhala) { LangLoader("script/Sinhala", GetParam()); }
|
||||
TEST_P(LoadScript, Syriac) { LangLoader("script/Syriac", GetParam()); }
|
||||
TEST_P(LoadScript, Tamil) { LangLoader("script/Tamil", GetParam()); }
|
||||
TEST_P(LoadScript, Telugu) { LangLoader("script/Telugu", GetParam()); }
|
||||
TEST_P(LoadScript, Thaana) { LangLoader("script/Thaana", GetParam()); }
|
||||
TEST_P(LoadScript, Thai) { LangLoader("script/Thai", GetParam()); }
|
||||
TEST_P(LoadScript, Tibetan) { LangLoader("script/Tibetan", GetParam()); }
|
||||
TEST_P(LoadScript, Vietnamese) { LangLoader("script/Vietnamese", GetParam()); }
|
||||
TEST_P(LoadScript, Kannada) {
|
||||
LangLoader("script/Kannada", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Khmer) {
|
||||
LangLoader("script/Khmer", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Lao) {
|
||||
LangLoader("script/Lao", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Latin) {
|
||||
LangLoader("script/Latin", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Malayalam) {
|
||||
LangLoader("script/Malayalam", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Myanmar) {
|
||||
LangLoader("script/Myanmar", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Oriya) {
|
||||
LangLoader("script/Oriya", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Sinhala) {
|
||||
LangLoader("script/Sinhala", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Syriac) {
|
||||
LangLoader("script/Syriac", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Tamil) {
|
||||
LangLoader("script/Tamil", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Telugu) {
|
||||
LangLoader("script/Telugu", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Thaana) {
|
||||
LangLoader("script/Thaana", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Thai) {
|
||||
LangLoader("script/Thai", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Tibetan) {
|
||||
LangLoader("script/Tibetan", GetParam());
|
||||
}
|
||||
TEST_P(LoadScript, Vietnamese) {
|
||||
LangLoader("script/Vietnamese", GetParam());
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadScript,
|
||||
::testing::Values(TESSDATA_DIR "_fast"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadScript,
|
||||
::testing::Values(TESSDATA_DIR "_best"));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript,
|
||||
::testing::Values(TESSDATA_DIR));
|
||||
INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript, ::testing::Values(TESSDATA_DIR));
|
||||
|
||||
class LoadLang : public QuickTest {};
|
||||
|
||||
// Test Load of English here, as the parameterized tests are disabled by
|
||||
// default.
|
||||
TEST_F(LoadLang, engFast) { LangLoader("eng", TESSDATA_DIR "_fast"); }
|
||||
TEST_F(LoadLang, engBest) { LangLoader("eng", TESSDATA_DIR "_best"); }
|
||||
TEST_F(LoadLang, engBestInt) { LangLoader("eng", TESSDATA_DIR); }
|
||||
TEST_F(LoadLang, engFast) {
|
||||
LangLoader("eng", TESSDATA_DIR "_fast");
|
||||
}
|
||||
TEST_F(LoadLang, engBest) {
|
||||
LangLoader("eng", TESSDATA_DIR "_best");
|
||||
}
|
||||
TEST_F(LoadLang, engBestInt) {
|
||||
LangLoader("eng", TESSDATA_DIR);
|
||||
}
|
||||
|
||||
// Use class LoadLang for languages which are NOT there in all three repos
|
||||
TEST_F(LoadLang, kmrFast) { LangLoader("kmr", TESSDATA_DIR "_fast"); }
|
||||
TEST_F(LoadLang, kmrBest) { LangLoader("kmr", TESSDATA_DIR "_best"); }
|
||||
TEST_F(LoadLang, kmrFast) {
|
||||
LangLoader("kmr", TESSDATA_DIR "_fast");
|
||||
}
|
||||
TEST_F(LoadLang, kmrBest) {
|
||||
LangLoader("kmr", TESSDATA_DIR "_best");
|
||||
}
|
||||
// TEST_F(LoadLang, kmrBestInt) {LangLoader("kmr" , TESSDATA_DIR);}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -25,15 +25,12 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
enum LogLevel {
|
||||
INFO, WARNING, ERROR, FATAL
|
||||
};
|
||||
enum LogLevel { INFO, WARNING, ERROR, FATAL };
|
||||
|
||||
// Avoid conflict with logging.h from TensorFlow.
|
||||
#undef LOG
|
||||
|
||||
static inline std::ostream& LOG(enum LogLevel level)
|
||||
{
|
||||
static inline std::ostream &LOG(enum LogLevel level) {
|
||||
switch (level) {
|
||||
case INFO:
|
||||
std::cout << "[INFO] ";
|
||||
@ -55,8 +52,7 @@ static inline std::ostream& LOG(enum LogLevel level)
|
||||
#undef QCHECK
|
||||
|
||||
// https://github.com/google/ion/blob/master/ion/base/logging.h
|
||||
static inline std::ostream& QCHECK(bool condition)
|
||||
{
|
||||
static inline std::ostream &QCHECK(bool condition) {
|
||||
if (condition) {
|
||||
static std::ostream null_stream(nullptr);
|
||||
return null_stream;
|
||||
|
@ -20,8 +20,8 @@ TEST_F(LSTMTrainerTest, TestSquashed) {
|
||||
// a small convolution/maxpool below that.
|
||||
// Match training conditions to those typically used with this spec:
|
||||
// recoding on, adam on.
|
||||
SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]",
|
||||
"SQU-2-layer-lstm", /*recode*/ true, /*adam*/ true);
|
||||
SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", "SQU-2-layer-lstm",
|
||||
/*recode*/ true, /*adam*/ true);
|
||||
double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2);
|
||||
EXPECT_LT(lstm_2d_err, 80);
|
||||
LOG(INFO) << "********** < 80 ************\n";
|
||||
|
@ -31,8 +31,7 @@ TEST_F(LSTMTrainerTest, BasicTest) {
|
||||
SetupTrainer(
|
||||
"[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 "
|
||||
"Ct1,1,64O1c1]",
|
||||
"no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false,
|
||||
2e-4, false, "eng");
|
||||
"no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false, 2e-4, false, "eng");
|
||||
double non_lstm_err = TrainIterations(kTrainerIterations * 4);
|
||||
EXPECT_LT(non_lstm_err, 98);
|
||||
LOG(INFO) << "********** Expected < 98 ************\n";
|
||||
@ -50,8 +49,7 @@ TEST_F(LSTMTrainerTest, BasicTest) {
|
||||
// Color learns almost as fast as normalized grey/2D.
|
||||
TEST_F(LSTMTrainerTest, ColorTest) {
|
||||
// A basic single-layer, single direction LSTM.
|
||||
SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||||
"2D-color-lstm", true, true);
|
||||
SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2D-color-lstm", true, true);
|
||||
double lstm_uni_err = TrainIterations(kTrainerIterations);
|
||||
EXPECT_LT(lstm_uni_err, 85);
|
||||
// EXPECT_GT(lstm_uni_err, 66);
|
||||
@ -73,8 +71,8 @@ TEST_F(LSTMTrainerTest, BidiTest) {
|
||||
// It takes a lot of iterations to get there.
|
||||
TEST_F(LSTMTrainerTest, Test2D) {
|
||||
// A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||||
"2-D-2-layer-lstm", false, false);
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false,
|
||||
false);
|
||||
double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2);
|
||||
EXPECT_LT(lstm_2d_err, 98);
|
||||
// EXPECT_GT(lstm_2d_err, 90);
|
||||
@ -88,8 +86,8 @@ TEST_F(LSTMTrainerTest, Test2D) {
|
||||
// without it.
|
||||
TEST_F(LSTMTrainerTest, TestAdam) {
|
||||
// A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||||
"2-D-2-layer-lstm", false, true);
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false,
|
||||
true);
|
||||
double lstm_2d_err = TrainIterations(kTrainerIterations);
|
||||
EXPECT_LT(lstm_2d_err, 70);
|
||||
LOG(INFO) << "********** Expected < 70 ************\n";
|
||||
@ -109,16 +107,15 @@ TEST_F(LSTMTrainerTest, SpeedTest) {
|
||||
// Tests that two identical networks trained the same get the same results.
|
||||
// Also tests that the same happens with a serialize/deserialize in the middle.
|
||||
TEST_F(LSTMTrainerTest, DeterminismTest) {
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||||
"2-D-2-layer-lstm", false, false);
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false,
|
||||
false);
|
||||
double lstm_2d_err_a = TrainIterations(kTrainerIterations);
|
||||
double act_error_a = trainer_->ActivationError();
|
||||
double char_error_a = trainer_->CharError();
|
||||
std::vector<char> trainer_a_data;
|
||||
EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(),
|
||||
&trainer_a_data));
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||||
"2-D-2-layer-lstm", false, false);
|
||||
EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), &trainer_a_data));
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false,
|
||||
false);
|
||||
double lstm_2d_err_b = TrainIterations(kTrainerIterations);
|
||||
double act_error_b = trainer_->ActivationError();
|
||||
double char_error_b = trainer_->CharError();
|
||||
@ -130,8 +127,8 @@ TEST_F(LSTMTrainerTest, DeterminismTest) {
|
||||
act_error_b = trainer_->ActivationError();
|
||||
char_error_b = trainer_->CharError();
|
||||
// Unpack into a new trainer and train that some more too.
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
|
||||
"2-D-2-layer-lstm", false, false);
|
||||
SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", "2-D-2-layer-lstm", false,
|
||||
false);
|
||||
EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, trainer_.get()));
|
||||
lstm_2d_err_a = TrainIterations(kTrainerIterations / 3);
|
||||
act_error_a = trainer_->ActivationError();
|
||||
@ -188,16 +185,13 @@ TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) {
|
||||
// Tests that layer access methods work correctly.
|
||||
TEST_F(LSTMTrainerTest, TestLayerAccess) {
|
||||
// A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom.
|
||||
SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm",
|
||||
false, false);
|
||||
SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", false, false);
|
||||
// Number of layers.
|
||||
const int kNumLayers = 8;
|
||||
// Expected layer names.
|
||||
const char* kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2",
|
||||
":3:0", ":4:0", ":4:1:0", ":5"};
|
||||
const char* kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL",
|
||||
"Maxpool", "Lfys32", "Lbx128LTR",
|
||||
"Lbx128", "Output"};
|
||||
const char *kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", ":3:0", ":4:0", ":4:1:0", ":5"};
|
||||
const char *kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL", "Maxpool",
|
||||
"Lfys32", "Lbx128LTR", "Lbx128", "Output"};
|
||||
// Expected number of weights.
|
||||
const int kNumWeights[kNumLayers] = {0,
|
||||
0,
|
||||
@ -212,8 +206,7 @@ TEST_F(LSTMTrainerTest, TestLayerAccess) {
|
||||
EXPECT_EQ(kNumLayers, layers.size());
|
||||
for (int i = 0; i < kNumLayers && i < layers.size(); ++i) {
|
||||
EXPECT_STREQ(kLayerIds[i], layers[i].c_str());
|
||||
EXPECT_STREQ(kLayerNames[i],
|
||||
trainer_->GetLayer(layers[i])->name().c_str());
|
||||
EXPECT_STREQ(kLayerNames[i], trainer_->GetLayer(layers[i])->name().c_str());
|
||||
EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights());
|
||||
}
|
||||
}
|
||||
|
@ -19,8 +19,8 @@
|
||||
#include "include_gunit.h"
|
||||
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "tprintf.h"
|
||||
#include "helpers.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#include "functions.h"
|
||||
#include "lang_model_helpers.h"
|
||||
@ -52,50 +52,45 @@ class LSTMTrainerTest : public testing::Test {
|
||||
|
||||
LSTMTrainerTest() {}
|
||||
std::string TestDataNameToPath(const std::string &name) {
|
||||
return file::JoinPath(TESTDATA_DIR,
|
||||
"" + name);
|
||||
return file::JoinPath(TESTDATA_DIR, "" + name);
|
||||
}
|
||||
std::string TessDataNameToPath(const std::string &name) {
|
||||
return file::JoinPath(TESSDATA_DIR,
|
||||
"" + name);
|
||||
return file::JoinPath(TESSDATA_DIR, "" + name);
|
||||
}
|
||||
std::string TestingNameToPath(const std::string &name) {
|
||||
return file::JoinPath(TESTING_DIR,
|
||||
"" + name);
|
||||
return file::JoinPath(TESTING_DIR, "" + name);
|
||||
}
|
||||
|
||||
void SetupTrainerEng(const std::string& network_spec, const std::string& model_name,
|
||||
bool recode, bool adam) {
|
||||
SetupTrainer(network_spec, model_name, "eng/eng.unicharset",
|
||||
"eng.Arial.exp0.lstmf", recode, adam, 5e-4, false, "eng");
|
||||
void SetupTrainerEng(const std::string &network_spec, const std::string &model_name, bool recode,
|
||||
bool adam) {
|
||||
SetupTrainer(network_spec, model_name, "eng/eng.unicharset", "eng.Arial.exp0.lstmf", recode,
|
||||
adam, 5e-4, false, "eng");
|
||||
}
|
||||
void SetupTrainer(const std::string &network_spec, const std::string &model_name,
|
||||
const std::string& unicharset_file, const std::string& lstmf_file,
|
||||
bool recode, bool adam, float learning_rate,
|
||||
bool layer_specific, const std::string& kLang) {
|
||||
const std::string &unicharset_file, const std::string &lstmf_file, bool recode,
|
||||
bool adam, float learning_rate, bool layer_specific, const std::string &kLang) {
|
||||
// constexpr char kLang[] = "eng"; // Exact value doesn't matter.
|
||||
std::string unicharset_name = TestDataNameToPath(unicharset_file);
|
||||
UNICHARSET unicharset;
|
||||
ASSERT_TRUE(unicharset.load_from_file(unicharset_name.c_str(), false));
|
||||
std::string script_dir = file::JoinPath(
|
||||
LANGDATA_DIR, "");
|
||||
std::string script_dir = file::JoinPath(LANGDATA_DIR, "");
|
||||
std::vector<STRING> words;
|
||||
EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir,
|
||||
kLang, !recode, words, words, words, false,
|
||||
nullptr, nullptr));
|
||||
EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir, kLang, !recode,
|
||||
words, words, words, false, nullptr, nullptr));
|
||||
std::string model_path = file::JoinPath(FLAGS_test_tmpdir, model_name);
|
||||
std::string checkpoint_path = model_path + "_checkpoint";
|
||||
trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(),
|
||||
0, 0));
|
||||
trainer_->InitCharSet(file::JoinPath(FLAGS_test_tmpdir, kLang,
|
||||
absl::StrCat(kLang, ".traineddata")));
|
||||
trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(), 0, 0));
|
||||
trainer_->InitCharSet(
|
||||
file::JoinPath(FLAGS_test_tmpdir, kLang, absl::StrCat(kLang, ".traineddata")));
|
||||
int net_mode = adam ? NF_ADAM : 0;
|
||||
// Adam needs a higher learning rate, due to not multiplying the effective
|
||||
// rate by 1/(1-momentum).
|
||||
if (adam) learning_rate *= 20.0f;
|
||||
if (layer_specific) net_mode |= NF_LAYER_SPECIFIC_LR;
|
||||
EXPECT_TRUE(trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1,
|
||||
learning_rate, 0.9, 0.999));
|
||||
if (adam)
|
||||
learning_rate *= 20.0f;
|
||||
if (layer_specific)
|
||||
net_mode |= NF_LAYER_SPECIFIC_LR;
|
||||
EXPECT_TRUE(
|
||||
trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1, learning_rate, 0.9, 0.999));
|
||||
std::vector<STRING> filenames;
|
||||
filenames.push_back(STRING(TestDataNameToPath(lstmf_file).c_str()));
|
||||
EXPECT_TRUE(trainer_->LoadAllTrainingData(filenames, CS_SEQUENTIAL, false));
|
||||
@ -119,7 +114,8 @@ class LSTMTrainerTest : public testing::Test {
|
||||
trainer_->MaintainCheckpoints(nullptr, &log_str);
|
||||
iteration = trainer_->training_iteration();
|
||||
mean_error *= 100.0 / kBatchIterations;
|
||||
if (mean_error < best_error) best_error = mean_error;
|
||||
if (mean_error < best_error)
|
||||
best_error = mean_error;
|
||||
} while (iteration < iteration_limit);
|
||||
LOG(INFO) << "Trainer error rate = " << best_error << "\n";
|
||||
return best_error;
|
||||
@ -134,8 +130,7 @@ class LSTMTrainerTest : public testing::Test {
|
||||
const ImageData &trainingdata =
|
||||
*trainer_->mutable_training_data()->GetPageBySerial(iteration);
|
||||
NetworkIO fwd_outputs, targets;
|
||||
if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) !=
|
||||
UNENCODABLE) {
|
||||
if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) != UNENCODABLE) {
|
||||
mean_error += trainer_->NewSingleError(ET_CHAR_ERROR);
|
||||
++error_count;
|
||||
}
|
||||
@ -150,8 +145,7 @@ class LSTMTrainerTest : public testing::Test {
|
||||
// int.
|
||||
double TestIntMode(int test_iterations) {
|
||||
std::vector<char> trainer_data;
|
||||
EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(),
|
||||
&trainer_data));
|
||||
EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), &trainer_data));
|
||||
// Get the error on the next few iterations in float mode.
|
||||
double float_err = TestIterations(test_iterations);
|
||||
// Restore the dump, convert to int and test error on that.
|
||||
@ -167,8 +161,8 @@ class LSTMTrainerTest : public testing::Test {
|
||||
void TestEncodeDecode(const std::string &lang, const std::string &str, bool recode) {
|
||||
std::string unicharset_name = lang + "/" + lang + ".unicharset";
|
||||
std::string lstmf_name = lang + ".Arial_Unicode_MS.exp0.lstmf";
|
||||
SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name,
|
||||
lstmf_name, recode, true, 5e-4f, true, lang);
|
||||
SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name, lstmf_name, recode, true,
|
||||
5e-4f, true, lang);
|
||||
std::vector<int> labels;
|
||||
EXPECT_TRUE(trainer_->EncodeString(str.c_str(), &labels));
|
||||
STRING decoded = trainer_->DecodeLabels(labels);
|
||||
|
@ -16,8 +16,7 @@
|
||||
namespace tesseract {
|
||||
|
||||
TEST_F(LSTMTrainerTest, EncodesEng) {
|
||||
TestEncodeDecodeBoth("eng",
|
||||
"The quick brown 'fox' jumps over: the lazy dog!");
|
||||
TestEncodeDecodeBoth("eng", "The quick brown 'fox' jumps over: the lazy dog!");
|
||||
}
|
||||
|
||||
TEST_F(LSTMTrainerTest, EncodesKan) {
|
||||
@ -25,8 +24,7 @@ TEST_F(LSTMTrainerTest, EncodesKan) {
|
||||
}
|
||||
|
||||
TEST_F(LSTMTrainerTest, EncodesKor) {
|
||||
TestEncodeDecodeBoth("kor",
|
||||
"이는 것으로 다시 넣을 수는 있지만 선택의 의미는");
|
||||
TestEncodeDecodeBoth("kor", "이는 것으로 다시 넣을 수는 있지만 선택의 의미는");
|
||||
}
|
||||
|
||||
TEST_F(LSTMTrainerTest, MapCoder) {
|
||||
@ -47,16 +45,15 @@ TEST_F(LSTMTrainerTest, MapCoder) {
|
||||
std::vector<int> fra_labels;
|
||||
EXPECT_TRUE(fra_trainer.EncodeString(kTestStr.c_str(), &fra_labels));
|
||||
// Use the mapper to compute what the labels are as deu.
|
||||
std::vector<int> mapping = fra_trainer.MapRecoder(deu_trainer.GetUnicharset(),
|
||||
deu_trainer.GetRecoder());
|
||||
std::vector<int> mapping =
|
||||
fra_trainer.MapRecoder(deu_trainer.GetUnicharset(), deu_trainer.GetRecoder());
|
||||
std::vector<int> mapped_fra_labels(fra_labels.size(), -1);
|
||||
for (int i = 0; i < fra_labels.size(); ++i) {
|
||||
mapped_fra_labels[i] = mapping[fra_labels[i]];
|
||||
EXPECT_NE(-1, mapped_fra_labels[i]) << "i=" << i << ", ch=" << kTestStr[i];
|
||||
EXPECT_EQ(mapped_fra_labels[i], deu_labels[i])
|
||||
<< "i=" << i << ", ch=" << kTestStr[i]
|
||||
<< " has deu label=" << deu_labels[i] << ", but mapped to "
|
||||
<< mapped_fra_labels[i];
|
||||
<< "i=" << i << ", ch=" << kTestStr[i] << " has deu label=" << deu_labels[i]
|
||||
<< ", but mapped to " << mapped_fra_labels[i];
|
||||
}
|
||||
// The german trainer can now decode them correctly.
|
||||
STRING decoded = deu_trainer.DecodeLabels(mapped_fra_labels);
|
||||
@ -73,8 +70,7 @@ TEST_F(LSTMTrainerTest, ConvertModel) {
|
||||
deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata"));
|
||||
// Load the fra traineddata, strip out the model, and save to a tmp file.
|
||||
TessdataManager mgr;
|
||||
std::string fra_data =
|
||||
file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata");
|
||||
std::string fra_data = file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata");
|
||||
CHECK(mgr.Init(fra_data.c_str()));
|
||||
LOG(INFO) << "Load " << fra_data << "\n";
|
||||
file::MakeTmpdir();
|
||||
@ -96,8 +92,8 @@ TEST_F(LSTMTrainerTest, ConvertModel) {
|
||||
api.SetImage(src_pix);
|
||||
std::unique_ptr<char[]> result(api.GetUTF8Text());
|
||||
std::string truth_text;
|
||||
CHECK_OK(file::GetContents(TestingNameToPath("phototest.gold.txt"),
|
||||
&truth_text, file::Defaults()));
|
||||
CHECK_OK(
|
||||
file::GetContents(TestingNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
|
||||
|
||||
EXPECT_STREQ(truth_text.c_str(), result.get());
|
||||
pixDestroy(&src_pix);
|
||||
|
@ -23,14 +23,14 @@
|
||||
|
||||
#include "include_gunit.h"
|
||||
|
||||
#include "log.h" // for LOG
|
||||
#include "unicharset.h"
|
||||
#include "commontraining.h"
|
||||
#include "errorcounter.h"
|
||||
#include "log.h" // for LOG
|
||||
#include "mastertrainer.h"
|
||||
#include "shapeclassifier.h"
|
||||
#include "shapetable.h"
|
||||
#include "trainingsample.h"
|
||||
#include "commontraining.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
#include "absl/strings/numbers.h" // for safe_strto32
|
||||
#include "absl/strings/str_split.h" // for absl::StrSplit
|
||||
@ -51,12 +51,10 @@ static const int kNumCorrect = kNumNonReject - kNumTop1Errs;
|
||||
// The total number of answers is given by the number of non-rejects plus
|
||||
// all the multiple answers.
|
||||
static const int kNumAnswers = kNumNonReject + 2 * (kNumTop2Errs - kNumTopNErrs) +
|
||||
(kNumTop1Errs - kNumTop2Errs) +
|
||||
(kNumTopTopErrs - kNumTop1Errs);
|
||||
(kNumTop1Errs - kNumTop2Errs) + (kNumTopTopErrs - kNumTop1Errs);
|
||||
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
static bool safe_strto32(const std::string& str, int* pResult)
|
||||
{
|
||||
static bool safe_strto32(const std::string &str, int *pResult) {
|
||||
long n = strtol(str.c_str(), nullptr, 0);
|
||||
*pResult = n;
|
||||
return true;
|
||||
@ -83,12 +81,12 @@ class MockClassifier : public ShapeClassifier {
|
||||
// If keep_this (a shape index) is >= 0, then the results should always
|
||||
// contain keep_this, and (if possible) anything of intermediate confidence.
|
||||
// The return value is the number of classes saved in results.
|
||||
int ClassifySample(const TrainingSample& sample, Pix* page_pix,
|
||||
int debug, UNICHAR_ID keep_this,
|
||||
int ClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this,
|
||||
std::vector<ShapeRating> *results) override {
|
||||
results->clear();
|
||||
// Everything except the first kNumNonReject is a reject.
|
||||
if (++num_done_ > kNumNonReject) return 0;
|
||||
if (++num_done_ > kNumNonReject)
|
||||
return 0;
|
||||
|
||||
int class_id = sample.class_id();
|
||||
int font_id = sample.font_id();
|
||||
@ -125,7 +123,9 @@ class MockClassifier : public ShapeClassifier {
|
||||
return results->size();
|
||||
}
|
||||
// Provides access to the ShapeTable that this classifier works with.
|
||||
const ShapeTable* GetShapeTable() const override { return shape_table_; }
|
||||
const ShapeTable *GetShapeTable() const override {
|
||||
return shape_table_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Borrowed pointer to the ShapeTable.
|
||||
@ -180,8 +180,7 @@ class MasterTrainerTest : public testing::Test {
|
||||
STRING file_prefix;
|
||||
delete shape_table_;
|
||||
shape_table_ = nullptr;
|
||||
master_trainer_ =
|
||||
LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix);
|
||||
master_trainer_ = LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix);
|
||||
EXPECT_TRUE(master_trainer_ != nullptr);
|
||||
EXPECT_TRUE(shape_table_ != nullptr);
|
||||
}
|
||||
@ -207,29 +206,23 @@ class MasterTrainerTest : public testing::Test {
|
||||
int shape_1 = shape_table_->FindShape(unichar_1, font_id);
|
||||
EXPECT_GE(shape_1, 0);
|
||||
|
||||
float dist_I_l =
|
||||
master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l);
|
||||
float dist_I_l = master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l);
|
||||
// No tolerance here. We expect that I and l should match exactly.
|
||||
EXPECT_EQ(0.0f, dist_I_l);
|
||||
float dist_l_I =
|
||||
master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I);
|
||||
float dist_l_I = master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I);
|
||||
// BOTH ways.
|
||||
EXPECT_EQ(0.0f, dist_l_I);
|
||||
|
||||
// l/1 on the other hand should be distinct.
|
||||
float dist_l_1 =
|
||||
master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1);
|
||||
float dist_l_1 = master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1);
|
||||
EXPECT_GT(dist_l_1, kMin1lDistance);
|
||||
float dist_1_l =
|
||||
master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l);
|
||||
float dist_1_l = master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l);
|
||||
EXPECT_GT(dist_1_l, kMin1lDistance);
|
||||
|
||||
// So should I/1.
|
||||
float dist_I_1 =
|
||||
master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1);
|
||||
float dist_I_1 = master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1);
|
||||
EXPECT_GT(dist_I_1, kMin1lDistance);
|
||||
float dist_1_I =
|
||||
master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I);
|
||||
float dist_1_I = master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I);
|
||||
EXPECT_GT(dist_1_I, kMin1lDistance);
|
||||
}
|
||||
|
||||
@ -263,18 +256,17 @@ TEST_F(MasterTrainerTest, ErrorCounterTest) {
|
||||
LoadMasterTrainer();
|
||||
// Add the space character to the shape_table_ if not already present to
|
||||
// count junk.
|
||||
if (shape_table_->FindShape(0, -1) < 0) shape_table_->AddShape(0, 0);
|
||||
if (shape_table_->FindShape(0, -1) < 0)
|
||||
shape_table_->AddShape(0, 0);
|
||||
// Make a mock classifier.
|
||||
auto shape_classifier = std::make_unique<MockClassifier>(shape_table_);
|
||||
// Get the accuracy report.
|
||||
STRING accuracy_report;
|
||||
master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0,
|
||||
false, shape_classifier.get(),
|
||||
&accuracy_report);
|
||||
master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0, false,
|
||||
shape_classifier.get(), &accuracy_report);
|
||||
LOG(INFO) << accuracy_report.c_str();
|
||||
std::string result_string = accuracy_report.c_str();
|
||||
std::vector<std::string> results =
|
||||
absl::StrSplit(result_string, '\t', absl::SkipEmpty());
|
||||
std::vector<std::string> results = absl::StrSplit(result_string, '\t', absl::SkipEmpty());
|
||||
EXPECT_EQ(tesseract::CT_SIZE + 1, results.size());
|
||||
int result_values[tesseract::CT_SIZE];
|
||||
for (int i = 0; i < tesseract::CT_SIZE; ++i) {
|
||||
@ -290,8 +282,7 @@ TEST_F(MasterTrainerTest, ErrorCounterTest) {
|
||||
EXPECT_EQ(kNumTop2Errs, result_values[tesseract::CT_UNICHAR_TOP2_ERR]);
|
||||
EXPECT_EQ(kNumTopNErrs, result_values[tesseract::CT_UNICHAR_TOPN_ERR]);
|
||||
// Each of the TOPTOP errs also counts as a multi-unichar.
|
||||
EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs,
|
||||
result_values[tesseract::CT_OK_MULTI_UNICHAR]);
|
||||
EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs, result_values[tesseract::CT_OK_MULTI_UNICHAR]);
|
||||
EXPECT_EQ(num_samples - kNumNonReject, result_values[tesseract::CT_REJECT]);
|
||||
EXPECT_EQ(kNumAnswers, result_values[tesseract::CT_NUM_RESULTS]);
|
||||
#endif
|
||||
|
@ -38,7 +38,8 @@ class MatrixTest : public ::testing::Test {
|
||||
for (int i = 0; i < kInputSize_; ++i) {
|
||||
src_.put(0, i, i);
|
||||
}
|
||||
for (int i = 0; i < kNumDims_; ++i) dims_[i] = 5 - i;
|
||||
for (int i = 0; i < kNumDims_; ++i)
|
||||
dims_[i] = 5 - i;
|
||||
}
|
||||
// Number of dimensions in src_.
|
||||
static const int kNumDims_ = 4;
|
||||
@ -134,4 +135,4 @@ TEST_F(MatrixTest, RotatingTranspose_0_2) {
|
||||
EXPECT_EQ(6, m(15, 0));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,8 +9,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "networkio.h"
|
||||
#include "include_gunit.h"
|
||||
#include "stridemap.h"
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
# include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D
|
||||
@ -44,8 +44,7 @@ class NetworkioTest : public ::testing::Test {
|
||||
arrays.push_back(SetupArray(4, 5, 12));
|
||||
std::vector<std::pair<int, int>> h_w_sizes;
|
||||
for (size_t i = 0; i < arrays.size(); ++i) {
|
||||
h_w_sizes.emplace_back(arrays[i].get()->height(),
|
||||
arrays[i].get()->width());
|
||||
h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width());
|
||||
}
|
||||
StrideMap stride_map;
|
||||
stride_map.SetStride(h_w_sizes);
|
||||
@ -53,8 +52,7 @@ class NetworkioTest : public ::testing::Test {
|
||||
// Iterate over the map, setting nio's contents from the arrays.
|
||||
StrideMap::Index index(stride_map);
|
||||
do {
|
||||
int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH));
|
||||
int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT), index.index(FD_WIDTH));
|
||||
nio->SetPixel(index.t(), 0, 128 + value, 0.0f, 128.0f);
|
||||
nio->SetPixel(index.t(), 1, 128 - value, 0.0f, 128.0f);
|
||||
} while (index.Increment());
|
||||
@ -113,9 +111,9 @@ TEST_F(NetworkioTest, CopyWithYReversal) {
|
||||
StrideMap::Index index(copy.stride_map());
|
||||
int next_t = 0;
|
||||
int pos = 0;
|
||||
std::vector<int> expected_values = {
|
||||
8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 27, 28, 29, 30,
|
||||
31, 22, 23, 24, 25, 26, 17, 18, 19, 20, 21, 12, 13, 14, 15, 16};
|
||||
std::vector<int> expected_values = {8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2,
|
||||
3, 27, 28, 29, 30, 31, 22, 23, 24, 25, 26,
|
||||
17, 18, 19, 20, 21, 12, 13, 14, 15, 16};
|
||||
do {
|
||||
int t = index.t();
|
||||
// The indexed values match the expected values.
|
||||
@ -150,9 +148,9 @@ TEST_F(NetworkioTest, CopyWithXReversal) {
|
||||
StrideMap::Index index(copy.stride_map());
|
||||
int next_t = 0;
|
||||
int pos = 0;
|
||||
std::vector<int> expected_values = {
|
||||
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 16, 15, 14, 13,
|
||||
12, 21, 20, 19, 18, 17, 26, 25, 24, 23, 22, 31, 30, 29, 28, 27};
|
||||
std::vector<int> expected_values = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
|
||||
8, 16, 15, 14, 13, 12, 21, 20, 19, 18, 17,
|
||||
26, 25, 24, 23, 22, 31, 30, 29, 28, 27};
|
||||
do {
|
||||
int t = index.t();
|
||||
// The indexed values match the expected values.
|
||||
@ -187,9 +185,9 @@ TEST_F(NetworkioTest, CopyWithXYTranspose) {
|
||||
StrideMap::Index index(copy.stride_map());
|
||||
int next_t = 0;
|
||||
int pos = 0;
|
||||
std::vector<int> expected_values = {
|
||||
0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 17, 22, 27,
|
||||
13, 18, 23, 28, 14, 19, 24, 29, 15, 20, 25, 30, 16, 21, 26, 31};
|
||||
std::vector<int> expected_values = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7,
|
||||
11, 12, 17, 22, 27, 13, 18, 23, 28, 14, 19,
|
||||
24, 29, 15, 20, 25, 30, 16, 21, 26, 31};
|
||||
do {
|
||||
int t = index.t();
|
||||
// The indexed values match the expected values.
|
||||
@ -214,4 +212,4 @@ TEST_F(NetworkioTest, CopyWithXYTranspose) {
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,11 +9,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "normstrngs.h"
|
||||
#include <tesseract/unichar.h>
|
||||
#include "absl/strings/str_format.h" // for absl::StrFormat
|
||||
#include "include_gunit.h"
|
||||
#include "normstrngs.h"
|
||||
#include "normstrngs_test.h"
|
||||
#include <tesseract/unichar.h>
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
# include "util/utf8/unilib.h" // for UniLib
|
||||
#endif
|
||||
@ -33,8 +33,7 @@ TEST(NormstrngsTest, BasicText) {
|
||||
const char *kBasicText = "AbCd Ef";
|
||||
std::string result;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
|
||||
GraphemeNorm::kNormalize, kBasicText,
|
||||
&result));
|
||||
GraphemeNorm::kNormalize, kBasicText, &result));
|
||||
EXPECT_STREQ(kBasicText, result.c_str());
|
||||
}
|
||||
|
||||
@ -42,14 +41,12 @@ TEST(NormstrngsTest, LigatureText) {
|
||||
const char *kTwoByteLigText = "ij"; // U+0133 (ij) -> ij
|
||||
std::string result;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
|
||||
GraphemeNorm::kNormalize, kTwoByteLigText,
|
||||
&result));
|
||||
GraphemeNorm::kNormalize, kTwoByteLigText, &result));
|
||||
EXPECT_STREQ("ij", result.c_str());
|
||||
|
||||
const char *kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
|
||||
GraphemeNorm::kNormalize, kThreeByteLigText,
|
||||
&result));
|
||||
GraphemeNorm::kNormalize, kThreeByteLigText, &result));
|
||||
EXPECT_STREQ("finds", result.c_str());
|
||||
}
|
||||
|
||||
@ -57,14 +54,12 @@ TEST(NormstrngsTest, OcrSpecificNormalization) {
|
||||
const char *kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+027 (')
|
||||
std::string result;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
|
||||
GraphemeNorm::kNormalize, kSingleQuoteText,
|
||||
&result));
|
||||
GraphemeNorm::kNormalize, kSingleQuoteText, &result));
|
||||
EXPECT_STREQ("'Hi", result.c_str());
|
||||
|
||||
const char *kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+022 (")
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
|
||||
GraphemeNorm::kNormalize, kDoubleQuoteText,
|
||||
&result));
|
||||
GraphemeNorm::kNormalize, kDoubleQuoteText, &result));
|
||||
EXPECT_STREQ("\"Hi", result.c_str());
|
||||
|
||||
const char *kEmDash = "Hi—"; // U+2014 (—) -> U+02D (-)
|
||||
@ -72,16 +67,14 @@ TEST(NormstrngsTest, OcrSpecificNormalization) {
|
||||
GraphemeNorm::kNormalize, kEmDash, &result));
|
||||
EXPECT_STREQ("Hi-", result.c_str());
|
||||
// Without the ocr normalization, these changes are not made.
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, kSingleQuoteText,
|
||||
&result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
kSingleQuoteText, &result));
|
||||
EXPECT_STREQ(kSingleQuoteText, result.c_str());
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, kDoubleQuoteText,
|
||||
&result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
kDoubleQuoteText, &result));
|
||||
EXPECT_STREQ(kDoubleQuoteText, result.c_str());
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, kEmDash, &result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
kEmDash, &result));
|
||||
EXPECT_STREQ(kEmDash, result.c_str());
|
||||
}
|
||||
|
||||
@ -90,38 +83,35 @@ const char kEngText[] = "the quick brown fox jumps over the lazy dog";
|
||||
const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
|
||||
const char kKorText[] = "이는 것으로";
|
||||
// Hindi words containing illegal vowel sequences.
|
||||
const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात",
|
||||
"कहीअे", "पत्रिाका", "छह्णाीस"};
|
||||
const char *kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात", "कहीअे", "पत्रिाका", "छह्णाीस"};
|
||||
// Thai illegal sequences.
|
||||
const char *kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"};
|
||||
|
||||
TEST(NormstrngsTest, DetectsCorrectText) {
|
||||
std::string chars;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, kEngText, &chars));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
kEngText, &chars));
|
||||
EXPECT_STREQ(kEngText, chars.c_str());
|
||||
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, kHinText, &chars))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
kHinText, &chars))
|
||||
<< "Incorrect text: '" << kHinText << "'";
|
||||
EXPECT_STREQ(kHinText, chars.c_str());
|
||||
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, kKorText, &chars));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
kKorText, &chars));
|
||||
EXPECT_STREQ(kKorText, chars.c_str());
|
||||
}
|
||||
|
||||
TEST(NormstrngsTest, DetectsIncorrectText) {
|
||||
for (size_t i = 0; i < countof(kBadlyFormedHinWords); ++i) {
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize,
|
||||
kBadlyFormedHinWords[i], nullptr))
|
||||
GraphemeNorm::kNormalize, kBadlyFormedHinWords[i], nullptr))
|
||||
<< kBadlyFormedHinWords[i];
|
||||
}
|
||||
for (size_t i = 0; i < countof(kBadlyFormedThaiWords); ++i) {
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize,
|
||||
kBadlyFormedThaiWords[i], nullptr))
|
||||
GraphemeNorm::kNormalize, kBadlyFormedThaiWords[i], nullptr))
|
||||
<< kBadlyFormedThaiWords[i];
|
||||
}
|
||||
}
|
||||
@ -129,9 +119,8 @@ TEST(NormstrngsTest, DetectsIncorrectText) {
|
||||
TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
|
||||
std::string nonindic = "Here's some latin text.";
|
||||
std::string dest;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, nonindic.c_str(),
|
||||
&dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
nonindic.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(nonindic);
|
||||
EXPECT_EQ(dest, nonindic);
|
||||
}
|
||||
@ -140,9 +129,8 @@ TEST(NormstrngsTest, NoLonelyJoiners) {
|
||||
std::string str = "x\u200d\u0d06\u0d34\u0d02";
|
||||
std::vector<std::string> glyphs;
|
||||
// Returns true, but the joiner is gone.
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[0], std::string("x"));
|
||||
@ -154,9 +142,8 @@ TEST(NormstrngsTest, NoLonelyJoinersPlus) {
|
||||
std::string str = "\u0d2a\u200d+\u0d2a\u0d4b";
|
||||
std::vector<std::string> glyphs;
|
||||
// Returns true, but the joiner is gone.
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[0], std::string("\u0d2a"));
|
||||
@ -171,9 +158,8 @@ TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {
|
||||
str = "\u200d\u200c\u200d";
|
||||
// Without the plus, the string is invalid.
|
||||
std::string result;
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result))
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result))
|
||||
<< PrintString32WithUnicodes(result);
|
||||
}
|
||||
|
||||
@ -385,8 +371,7 @@ TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
|
||||
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
|
||||
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
|
||||
std::string str = EncodeAsUTF8(ch);
|
||||
EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
|
||||
IsInterchangeValid7BitAscii(ch));
|
||||
EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), IsInterchangeValid7BitAscii(ch));
|
||||
}
|
||||
#else
|
||||
// Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
|
||||
@ -409,7 +394,8 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
|
||||
const int32_t kMinUnicodeValue = 33;
|
||||
const int32_t kMaxUnicodeValue = 0x10FFFF;
|
||||
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
|
||||
if (!IsValidCodepoint(ch)) continue;
|
||||
if (!IsValidCodepoint(ch))
|
||||
continue;
|
||||
SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
|
||||
std::string str = EncodeAsUTF8(ch);
|
||||
const std::string expected_half_str =
|
||||
|
@ -12,12 +12,12 @@
|
||||
#ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
|
||||
#define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
|
||||
|
||||
#include <tesseract/unichar.h>
|
||||
#include <sstream> // for std::stringstream
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_join.h"
|
||||
#include <tesseract/unichar.h>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -46,36 +46,28 @@ inline std::string PrintStringVectorWithUnicodes(const std::vector<std::string>&
|
||||
}
|
||||
|
||||
inline void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode,
|
||||
int unicode_count, int glyph_count,
|
||||
int grapheme_count,
|
||||
int unicode_count, int glyph_count, int grapheme_count,
|
||||
const std::string &target_str) {
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true,
|
||||
str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), unicode_count)
|
||||
<< PrintStringVectorWithUnicodes(glyphs);
|
||||
u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true, str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), unicode_count) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true,
|
||||
str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), glyph_count)
|
||||
<< PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), glyph_count) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), grapheme_count)
|
||||
<< PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kCombined,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), grapheme_count) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
|
||||
GraphemeNormMode::kSingleString,
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, GraphemeNormMode::kSingleString,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(target_str, glyphs[0]);
|
||||
std::string result;
|
||||
EXPECT_TRUE(NormalizeUTF8String(
|
||||
u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result));
|
||||
EXPECT_TRUE(
|
||||
NormalizeUTF8String(u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result));
|
||||
EXPECT_EQ(target_str, result);
|
||||
}
|
||||
|
||||
|
@ -19,12 +19,12 @@
|
||||
// expects clones of tessdata, tessdata_fast and tessdata_best repos
|
||||
|
||||
//#include "log.h"
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <iostream>
|
||||
#include <memory> // std::unique_ptr
|
||||
#include <string>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include "include_gunit.h"
|
||||
#include <allheaders.h>
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -36,8 +36,7 @@ class TestClass : public testing::Test {
|
||||
static void OSDTester(int expected_deg, const char *imgname, const char *tessdatadir) {
|
||||
// log.info() << tessdatadir << " for image: " << imgname << std::endl;
|
||||
std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
|
||||
ASSERT_FALSE(api->Init(tessdatadir, "osd"))
|
||||
<< "Could not initialize tesseract.";
|
||||
ASSERT_FALSE(api->Init(tessdatadir, "osd")) << "Could not initialize tesseract.";
|
||||
Pix *image = pixRead(imgname);
|
||||
ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
|
||||
api->SetImage(image);
|
||||
@ -45,8 +44,8 @@ static void OSDTester(int expected_deg, const char* imgname, const char* tessdat
|
||||
float orient_conf;
|
||||
const char *script_name;
|
||||
float script_conf;
|
||||
bool detected = api->DetectOrientationScript(&orient_deg, &orient_conf,
|
||||
&script_name, &script_conf);
|
||||
bool detected =
|
||||
api->DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf);
|
||||
ASSERT_FALSE(!detected) << "Failed to detect OSD.";
|
||||
printf(
|
||||
"************ Orientation in degrees: %d, Orientation confidence: %.2f\n"
|
||||
@ -59,75 +58,66 @@ static void OSDTester(int expected_deg, const char* imgname, const char* tessdat
|
||||
#endif
|
||||
|
||||
class OSDTest : public TestClass,
|
||||
public ::testing::WithParamInterface<
|
||||
std::tuple<int, const char*, const char*>> {};
|
||||
public ::testing::WithParamInterface<std::tuple<int, const char *, const char *>> {
|
||||
};
|
||||
|
||||
TEST_P(OSDTest, MatchOrientationDegrees) {
|
||||
#ifdef DISABLED_LEGACY_ENGINE
|
||||
// Skip test because TessBaseAPI::DetectOrientationScript is missing.
|
||||
GTEST_SKIP();
|
||||
#else
|
||||
OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()),
|
||||
std::get<2>(GetParam()));
|
||||
OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam()));
|
||||
#endif
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataEngEuroHebrew, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataEngEuroHebrew, OSDTest,
|
||||
::testing::Combine(::testing::Values(0),
|
||||
::testing::Values(TESTING_DIR "/phototest.tif",
|
||||
TESTING_DIR "/eurotext.tif",
|
||||
TESTING_DIR "/hebrew.png"),
|
||||
::testing::Values(TESSDATA_DIR)));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataBestEngEuroHebrew, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataBestEngEuroHebrew, OSDTest,
|
||||
::testing::Combine(::testing::Values(0),
|
||||
::testing::Values(TESTING_DIR "/phototest.tif",
|
||||
TESTING_DIR "/eurotext.tif",
|
||||
TESTING_DIR "/hebrew.png"),
|
||||
::testing::Values(TESSDATA_DIR "_best")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataFastEngEuroHebrew, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataFastEngEuroHebrew, OSDTest,
|
||||
::testing::Combine(::testing::Values(0),
|
||||
::testing::Values(TESTING_DIR "/phototest.tif",
|
||||
TESTING_DIR "/eurotext.tif",
|
||||
TESTING_DIR "/hebrew.png"),
|
||||
::testing::Values(TESSDATA_DIR "_fast")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataFastRotated90, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataFastRotated90, OSDTest,
|
||||
::testing::Combine(::testing::Values(90),
|
||||
::testing::Values(TESTING_DIR
|
||||
"/phototest-rotated-R.png"),
|
||||
::testing::Values(TESSDATA_DIR "_fast")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataFastRotated180, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataFastRotated180, OSDTest,
|
||||
::testing::Combine(::testing::Values(180),
|
||||
::testing::Values(TESTING_DIR
|
||||
"/phototest-rotated-180.png"),
|
||||
::testing::Values(TESSDATA_DIR "_fast")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataFastRotated270, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataFastRotated270, OSDTest,
|
||||
::testing::Combine(::testing::Values(270),
|
||||
::testing::Values(TESTING_DIR
|
||||
"/phototest-rotated-L.png"),
|
||||
::testing::Values(TESSDATA_DIR "_fast")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataFastDevaRotated270, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataFastDevaRotated270, OSDTest,
|
||||
::testing::Combine(::testing::Values(270),
|
||||
::testing::Values(TESTING_DIR
|
||||
"/devatest-rotated-270.png"),
|
||||
::testing::Values(TESSDATA_DIR "_fast")));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
TessdataFastDeva, OSDTest,
|
||||
INSTANTIATE_TEST_SUITE_P(TessdataFastDeva, OSDTest,
|
||||
::testing::Combine(::testing::Values(0),
|
||||
::testing::Values(TESTING_DIR "/devatest.png"),
|
||||
::testing::Values(TESSDATA_DIR "_fast")));
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -14,12 +14,12 @@
|
||||
#else
|
||||
# include <unistd.h> // for access
|
||||
#endif
|
||||
#include <string>
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <string>
|
||||
#include "helpers.h"
|
||||
#include "log.h"
|
||||
#include "include_gunit.h"
|
||||
#include "log.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -54,8 +54,8 @@ class PageSegModeTest : public testing::Test {
|
||||
|
||||
// Tests that the given rectangle produces exactly the given text in the
|
||||
// given segmentation mode (after chopping off the last 2 newlines.)
|
||||
void VerifyRectText(tesseract::PageSegMode mode, const char* str,
|
||||
int left, int top, int width, int height) {
|
||||
void VerifyRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width,
|
||||
int height) {
|
||||
api_.SetPageSegMode(mode);
|
||||
api_.SetRectangle(left, top, width, height);
|
||||
char *result = api_.GetUTF8Text();
|
||||
@ -67,8 +67,8 @@ class PageSegModeTest : public testing::Test {
|
||||
|
||||
// Tests that the given rectangle does NOT produce the given text in the
|
||||
// given segmentation mode.
|
||||
void NotRectText(tesseract::PageSegMode mode, const char* str,
|
||||
int left, int top, int width, int height) {
|
||||
void NotRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width,
|
||||
int height) {
|
||||
api_.SetPageSegMode(mode);
|
||||
api_.SetRectangle(left, top, width, height);
|
||||
char *result = api_.GetUTF8Text();
|
||||
@ -95,20 +95,15 @@ TEST_F(PageSegModeTest, WordTest) {
|
||||
VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1411, 252, 78, 62);
|
||||
VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1396, 218, 114, 102);
|
||||
// Test a random pair of words as a line
|
||||
VerifyRectText(tesseract::PSM_SINGLE_LINE,
|
||||
"What should", 237, 393, 256, 36);
|
||||
VerifyRectText(tesseract::PSM_SINGLE_LINE, "What should", 237, 393, 256, 36);
|
||||
// Test a random pair of words as a word
|
||||
VerifyRectText(tesseract::PSM_SINGLE_WORD,
|
||||
"Whatshould", 237, 393, 256, 36);
|
||||
VerifyRectText(tesseract::PSM_SINGLE_WORD, "Whatshould", 237, 393, 256, 36);
|
||||
// Test single block mode.
|
||||
VerifyRectText(tesseract::PSM_SINGLE_BLOCK,
|
||||
"both the\nfrom the", 237, 450, 172, 94);
|
||||
VerifyRectText(tesseract::PSM_SINGLE_BLOCK, "both the\nfrom the", 237, 450, 172, 94);
|
||||
// But doesn't work in line or word mode.
|
||||
NotRectText(tesseract::PSM_SINGLE_LINE,
|
||||
"both the\nfrom the", 237, 450, 172, 94);
|
||||
NotRectText(tesseract::PSM_SINGLE_WORD,
|
||||
"both the\nfrom the", 237, 450, 172, 94);
|
||||
NotRectText(tesseract::PSM_SINGLE_LINE, "both the\nfrom the", 237, 450, 172, 94);
|
||||
NotRectText(tesseract::PSM_SINGLE_WORD, "both the\nfrom the", 237, 450, 172, 94);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,15 +9,15 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "pango_font_info.h"
|
||||
#include <pango/pango.h>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <pango/pango.h>
|
||||
#include "include_gunit.h"
|
||||
#include "absl/strings/str_cat.h" // for absl::StrCat
|
||||
#include "commandlineflags.h"
|
||||
#include "fileio.h"
|
||||
#include "pango_font_info.h"
|
||||
#include "absl/strings/str_cat.h" // for absl::StrCat
|
||||
#include "gmock/gmock-matchers.h" // for EXPECT_THAT
|
||||
#include "include_gunit.h"
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
# include "util/utf8/unicodetext.h" // for UnicodeText
|
||||
#endif
|
||||
@ -25,8 +25,7 @@
|
||||
namespace tesseract {
|
||||
|
||||
// Fonts in testdata directory
|
||||
const char* kExpectedFontNames[] = {
|
||||
"Arab",
|
||||
const char *kExpectedFontNames[] = {"Arab",
|
||||
"Arial Bold Italic",
|
||||
"DejaVu Sans Ultra-Light",
|
||||
"Lohit Hindi",
|
||||
@ -36,8 +35,7 @@ const char* kExpectedFontNames[] = {
|
||||
"Times New Roman,", // Pango v1.36.2 requires a trailing ','
|
||||
#endif
|
||||
"UnBatang",
|
||||
"Verdana"
|
||||
};
|
||||
"Verdana"};
|
||||
|
||||
// Sample text used in tests.
|
||||
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
|
||||
@ -50,8 +48,7 @@ const char* kBadlyFormedHinWords[] = {
|
||||
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
|
||||
#endif
|
||||
// Pango v1.36.2 will render the above words even though they are invalid.
|
||||
"प्रंात", nullptr
|
||||
};
|
||||
"प्रंात", nullptr};
|
||||
|
||||
static PangoFontMap *font_map;
|
||||
|
||||
@ -135,8 +132,7 @@ TEST_F(PangoFontInfoTest, CanRenderString) {
|
||||
TEST_F(PangoFontInfoTest, CanRenderLigature) {
|
||||
font_info_.ParseFontDescriptionName("Arab 12");
|
||||
const char kArabicLigature[] = "لا";
|
||||
EXPECT_TRUE(
|
||||
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
|
||||
EXPECT_TRUE(font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
|
||||
|
||||
printf("Next word\n");
|
||||
EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
|
||||
@ -150,8 +146,8 @@ TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) {
|
||||
TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
|
||||
font_info_.ParseFontDescriptionName("Lohit Hindi 12");
|
||||
for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) {
|
||||
EXPECT_FALSE(font_info_.CanRenderString(kBadlyFormedHinWords[i],
|
||||
strlen(kBadlyFormedHinWords[i])))
|
||||
EXPECT_FALSE(
|
||||
font_info_.CanRenderString(kBadlyFormedHinWords[i], strlen(kBadlyFormedHinWords[i])))
|
||||
<< "Can render " << kBadlyFormedHinWords[i];
|
||||
}
|
||||
}
|
||||
@ -195,8 +191,7 @@ class FontUtilsTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
void CountUnicodeChars(const char* utf8_text,
|
||||
std::unordered_map<char32, int64_t>* ch_map) {
|
||||
void CountUnicodeChars(const char *utf8_text, std::unordered_map<char32, int64_t> *ch_map) {
|
||||
ch_map->clear();
|
||||
UnicodeText ut;
|
||||
ut.PointToUTF8(utf8_text, strlen(utf8_text));
|
||||
@ -204,7 +199,8 @@ class FontUtilsTest : public ::testing::Test {
|
||||
# if 0
|
||||
if (UnicodeProps::IsWhitespace(*it)) continue;
|
||||
# else
|
||||
if (std::isspace(*it)) continue;
|
||||
if (std::isspace(*it))
|
||||
continue;
|
||||
# endif
|
||||
++(*ch_map)[*it];
|
||||
}
|
||||
@ -271,8 +267,8 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
|
||||
SCOPED_TRACE(kLangNames[i]);
|
||||
std::vector<std::string> graphemes;
|
||||
std::string selected_font;
|
||||
EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
|
||||
&selected_font, &graphemes));
|
||||
EXPECT_TRUE(
|
||||
FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), &selected_font, &graphemes));
|
||||
EXPECT_TRUE(selected_font.size());
|
||||
EXPECT_TRUE(graphemes.size());
|
||||
}
|
||||
@ -282,8 +278,8 @@ TEST_F(FontUtilsTest, DoesFailToSelectFont) {
|
||||
const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
|
||||
std::vector<std::string> graphemes;
|
||||
std::string selected_font;
|
||||
EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
|
||||
&selected_font, &graphemes));
|
||||
EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), &selected_font,
|
||||
&graphemes));
|
||||
}
|
||||
|
||||
#if 0
|
||||
@ -331,4 +327,4 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -55,8 +55,7 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) {
|
||||
const int kCharWidth = 10;
|
||||
const int kLineSpace = 30;
|
||||
info->text = text;
|
||||
info->has_leaders =
|
||||
strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr;
|
||||
info->has_leaders = strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr;
|
||||
info->has_drop_cap = false;
|
||||
info->pix_ldistance = info->pix_rdistance = 0;
|
||||
info->average_interword_space = kCharWidth;
|
||||
@ -66,7 +65,8 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) {
|
||||
|
||||
std::vector<std::string> words = absl::StrSplit(text, ' ', absl::SkipEmpty());
|
||||
info->num_words = words.size();
|
||||
if (info->num_words < 1) return;
|
||||
if (info->num_words < 1)
|
||||
return;
|
||||
|
||||
info->lword_text = words[0].c_str();
|
||||
info->rword_text = words[words.size() - 1].c_str();
|
||||
@ -75,8 +75,7 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) {
|
||||
lspace++;
|
||||
}
|
||||
int rspace = 0;
|
||||
while (rspace < info->text.size() &&
|
||||
text[info->text.size() - rspace - 1] == ' ') {
|
||||
while (rspace < info->text.size() && text[info->text.size() - rspace - 1] == ' ') {
|
||||
rspace++;
|
||||
}
|
||||
|
||||
@ -87,20 +86,16 @@ void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) {
|
||||
int rword_width = kCharWidth * info->rword_text.size();
|
||||
info->pix_ldistance = lspace * kCharWidth;
|
||||
info->pix_rdistance = rspace * kCharWidth;
|
||||
info->lword_box =
|
||||
TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);
|
||||
info->lword_box = TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);
|
||||
info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom,
|
||||
row_right - info->pix_rdistance, top);
|
||||
LeftWordAttributes(
|
||||
nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,
|
||||
LeftWordAttributes(nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,
|
||||
&info->lword_likely_starts_idea, &info->lword_likely_ends_idea);
|
||||
RightWordAttributes(
|
||||
nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,
|
||||
RightWordAttributes(nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,
|
||||
&info->rword_likely_starts_idea, &info->rword_likely_ends_idea);
|
||||
}
|
||||
|
||||
void MakeAsciiRowInfos(const TextAndModel* row_infos, int n,
|
||||
std::vector<RowInfo>* output) {
|
||||
void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo> *output) {
|
||||
output->clear();
|
||||
RowInfo info;
|
||||
for (int i = 0; i < n; i++) {
|
||||
@ -122,8 +117,10 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n,
|
||||
for (int i = 1; i < n; i++) {
|
||||
bool has_break = correct[i].model_type != PCONT;
|
||||
bool detected_break = (detector_output[i - 1] != detector_output[i]);
|
||||
if (has_break && !detected_break) missed_breaks++;
|
||||
if (detected_break && !has_break) incorrect_breaks++;
|
||||
if (has_break && !detected_break)
|
||||
missed_breaks++;
|
||||
if (detected_break && !has_break)
|
||||
incorrect_breaks++;
|
||||
if (has_break) {
|
||||
if (correct[i].model_type == PNONE) {
|
||||
if (detector_output[i]->model != nullptr) {
|
||||
@ -150,16 +147,15 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n,
|
||||
EXPECT_EQ(poorly_matched_models, 0);
|
||||
EXPECT_EQ(bad_list_items, 0);
|
||||
EXPECT_EQ(bad_crowns, 0);
|
||||
if (incorrect_breaks || missed_breaks || poorly_matched_models ||
|
||||
bad_list_items || bad_crowns) {
|
||||
if (incorrect_breaks || missed_breaks || poorly_matched_models || bad_list_items || bad_crowns) {
|
||||
std::vector<std::string> dbg_lines;
|
||||
dbg_lines.push_back("# ==========================");
|
||||
dbg_lines.push_back("# Correct paragraph breaks:");
|
||||
dbg_lines.push_back("# ==========================");
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (correct[i].model_type != PCONT) {
|
||||
dbg_lines.push_back(absl::StrCat(
|
||||
correct[i].ascii, " # ", correct[i].model.ToString().c_str(),
|
||||
dbg_lines.push_back(absl::StrCat(correct[i].ascii, " # ",
|
||||
correct[i].model.ToString().c_str(),
|
||||
correct[i].is_very_first_or_continuation ? " crown" : "",
|
||||
correct[i].is_list_item ? " li" : ""));
|
||||
} else {
|
||||
@ -174,8 +170,8 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n,
|
||||
std::string annotation;
|
||||
if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
|
||||
if (detector_output[i] && detector_output[i]->model) {
|
||||
annotation += absl::StrCat(
|
||||
" # ", detector_output[i]->model->ToString().c_str(),
|
||||
annotation +=
|
||||
absl::StrCat(" # ", detector_output[i]->model->ToString().c_str(),
|
||||
detector_output[i]->is_very_first_or_continuation ? " crown" : "",
|
||||
detector_output[i]->is_list_item ? " li" : "");
|
||||
} else {
|
||||
@ -196,8 +192,7 @@ void TestParagraphDetection(const TextAndModel* correct, int num_rows) {
|
||||
|
||||
MakeAsciiRowInfos(correct, num_rows, &row_infos);
|
||||
int debug_level(3);
|
||||
tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, &paragraphs,
|
||||
&models);
|
||||
tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, &paragraphs, &models);
|
||||
EvaluateParagraphDetection(correct, num_rows, row_owners);
|
||||
for (auto *model : models) {
|
||||
delete model;
|
||||
@ -242,13 +237,11 @@ const TextAndModel kTwoSimpleParagraphs[] = {
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestSimpleParagraphDetection) {
|
||||
TestParagraphDetection(kTwoSimpleParagraphs,
|
||||
countof(kTwoSimpleParagraphs));
|
||||
TestParagraphDetection(kTwoSimpleParagraphs, countof(kTwoSimpleParagraphs));
|
||||
}
|
||||
|
||||
const TextAndModel kFewCluesWithCrown[] = {
|
||||
{"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true, false},
|
||||
{"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
|
||||
{"of the page and takes two lines.", PCONT, PModel(), false, false},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"which indicates that the first ", PCONT, PModel(), false, false},
|
||||
@ -259,13 +252,11 @@ const TextAndModel kFewCluesWithCrown[] = {
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestFewCluesWithCrown) {
|
||||
TestParagraphDetection(kFewCluesWithCrown,
|
||||
countof(kFewCluesWithCrown));
|
||||
TestParagraphDetection(kFewCluesWithCrown, countof(kFewCluesWithCrown));
|
||||
}
|
||||
|
||||
const TextAndModel kCrownedParagraph[] = {
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true, false},
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
|
||||
{"often not indented as the rest ", PCONT, PModel(), false, false},
|
||||
{"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
|
||||
{"less it should be counted as the", PCONT, PModel(), false, false},
|
||||
@ -297,8 +288,7 @@ const TextAndModel kFlushLeftParagraphs[] = {
|
||||
};
|
||||
|
||||
TEST(ParagraphsText, TestRealFlushLeftParagraphs) {
|
||||
TestParagraphDetection(kFlushLeftParagraphs,
|
||||
countof(kFlushLeftParagraphs));
|
||||
TestParagraphDetection(kFlushLeftParagraphs, countof(kFlushLeftParagraphs));
|
||||
}
|
||||
|
||||
const TextAndModel kSingleFullPageContinuation[] = {
|
||||
@ -387,8 +377,7 @@ const TextAndModel kComplexPage1[] = {
|
||||
{"from a previous page, as it is ", PCONT, PModel(), false, false},
|
||||
{"indented just like this second ", PCONT, PModel(), false, false},
|
||||
{"paragraph. ", PCONT, PModel(), false, false},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
|
||||
true, false},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},
|
||||
{" looks like the prior text ", PCONT, PModel(), false, false},
|
||||
{" but it is indented more ", PCONT, PModel(), false, false},
|
||||
{" and is fully justified. ", PCONT, PModel(), false, false},
|
||||
@ -396,22 +385,17 @@ const TextAndModel kComplexPage1[] = {
|
||||
{"centered text, block quotes, ", PCONT, PModel(), false, false},
|
||||
{"normal paragraphs, and lists ", PCONT, PModel(), false, false},
|
||||
{"like what follows? ", PCONT, PModel(), false, false},
|
||||
{"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" looking for lines where the ", PCONT, PModel(), false, false},
|
||||
{" first word of the next line ", PCONT, PModel(), false, false},
|
||||
{" would fit on the previous ", PCONT, PModel(), false, false},
|
||||
{" line. ", PCONT, PModel(), false, false},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" Python and try it out. ", PCONT, PModel(), false, false},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" mistakes. ", PCONT, PModel(), false, false},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"you can try to identify source ", PCONT, PModel(), false, false},
|
||||
{"code. Ouch! ", PCONT, PModel(), false, false},
|
||||
@ -423,8 +407,7 @@ TEST(ParagraphsTest, TestComplexPage1) {
|
||||
|
||||
// The same as above, but wider.
|
||||
const TextAndModel kComplexPage2[] = {
|
||||
{" Awesome ", PSTART,
|
||||
PModel(kCenter, 0, 0, 0, 0), false, false},
|
||||
{" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
|
||||
{" Centered Title ", PCONT, PModel(), false, false},
|
||||
{" Paragraph Detection ", PCONT, PModel(), false, false},
|
||||
{" OCR TEAM ", PCONT, PModel(), false, false},
|
||||
@ -439,8 +422,7 @@ const TextAndModel kComplexPage2[] = {
|
||||
{"from a previous page, as it is in- ", PCONT, PModel(), false, false},
|
||||
{"dented just like this second para- ", PCONT, PModel(), false, false},
|
||||
{"graph. ", PCONT, PModel(), false, false},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
|
||||
true, false},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},
|
||||
{" looks like the prior text ", PCONT, PModel(), false, false},
|
||||
{" but it is indented more ", PCONT, PModel(), false, false},
|
||||
{" and is fully justified. ", PCONT, PModel(), false, false},
|
||||
@ -448,19 +430,15 @@ const TextAndModel kComplexPage2[] = {
|
||||
{"ed text, block quotes, normal para-", PCONT, PModel(), false, false},
|
||||
{"graphs, and lists like what follow?", PCONT, PModel(), false, false},
|
||||
{"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!!
|
||||
{"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" looking for lines where the ", PCONT, PModel(), false, false},
|
||||
{" first word of the next line ", PCONT, PModel(), false, false},
|
||||
{" would fit on the previous line. ", PCONT, PModel(), false, false},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" Python and try it out. ", PCONT, PModel(), false, false},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" mistakes. ", PCONT, PModel(), false, false},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
|
||||
{" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"you can try to identify source ", PCONT, PModel(), false, false},
|
||||
{"code. Ouch! ", PCONT, PModel(), false, false},
|
||||
@ -471,8 +449,7 @@ TEST(ParagraphsTest, TestComplexPage2) {
|
||||
}
|
||||
|
||||
const TextAndModel kSubtleCrown[] = {
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true, false},
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
|
||||
{"often not indented as the rest ", PCONT, PModel(), false, false},
|
||||
{"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
|
||||
{"less it should be counted as the", PCONT, PModel(), false, false},
|
||||
@ -493,41 +470,70 @@ TEST(ParagraphsTest, TestStrayLineInBlock) {
|
||||
const TextAndModel kUnlvRep3AO[] = {
|
||||
{" Defined contribution plans cover employees in Australia, New", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false},
|
||||
{"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false},
|
||||
{"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false},
|
||||
{"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, false},
|
||||
{"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false},
|
||||
{"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false},
|
||||
{"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false},
|
||||
{"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(),
|
||||
false, false},
|
||||
{"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false,
|
||||
false},
|
||||
{"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false,
|
||||
false},
|
||||
{" In addition to providing pension benefits, the Company pro- ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false},
|
||||
{"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false},
|
||||
{"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false, false},
|
||||
{"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false},
|
||||
{"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false},
|
||||
{"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false},
|
||||
{"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false},
|
||||
{" The U.S. plan covering the parent company is the largest plan.",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false},
|
||||
{"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false},
|
||||
{"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false},
|
||||
{"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false},
|
||||
{"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false},
|
||||
{"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false},
|
||||
{"credited service. The Company has the ability to change these ", PCONT, PModel(), false, false},
|
||||
{"benefits at any time. ", PCONT, PModel(), false, false},
|
||||
{"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"and life insurance benefits in the year incurred. ", PCONT, PModel(), false,
|
||||
false},
|
||||
{" The U.S. plan covering the parent company is the largest plan.", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(),
|
||||
false, false},
|
||||
{"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"credited service. The Company has the ability to change these ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"benefits at any time. ", PCONT, PModel(), false,
|
||||
false},
|
||||
{" Effective October 1993, the Company amended its health ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false},
|
||||
{"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false},
|
||||
{"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false},
|
||||
{"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false},
|
||||
{"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false},
|
||||
{"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false},
|
||||
{"for 1994 by approximately $83. ", PCONT, PModel(), false, false},
|
||||
{"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false,
|
||||
false},
|
||||
{"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false,
|
||||
false},
|
||||
{"for 1994 by approximately $83. ", PCONT, PModel(), false,
|
||||
false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestUnlvInsurance) {
|
||||
@ -560,8 +566,8 @@ TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
|
||||
}
|
||||
|
||||
const TextAndModel kTextWithSourceCode[] = {
|
||||
{" A typical page of a programming book may contain", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{" A typical page of a programming book may contain", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
false, false},
|
||||
{"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false},
|
||||
{"being described in prose. Such examples should be", PCONT, PModel(), false, false},
|
||||
{"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false},
|
||||
@ -569,8 +575,8 @@ const TextAndModel kTextWithSourceCode[] = {
|
||||
{"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false},
|
||||
{"source code would lead to a bad reading experience", PCONT, PModel(), false, false},
|
||||
{"when the text is re-flowed. ", PCONT, PModel(), false, false},
|
||||
{" Let's show this by describing the function fact-", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{" Let's show this by describing the function fact-", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
false, false},
|
||||
{"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false},
|
||||
{"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false},
|
||||
{"that the typical C implementation will only work ", PCONT, PModel(), false, false},
|
||||
@ -583,80 +589,96 @@ const TextAndModel kTextWithSourceCode[] = {
|
||||
{" return n * factorial(n - 1); ", PCONT, PModel(), false, false},
|
||||
{" } ", PCONT, PModel(), false, false},
|
||||
{" ", PCONT, PModel(), false, false},
|
||||
{" The C programming language does not have built- ", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{" The C programming language does not have built- ", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
false, false},
|
||||
{"in support for detecting integer overflow, so this", PCONT, PModel(), false, false},
|
||||
{"naive implementation simply returns random values ", PCONT, PModel(), false, false},
|
||||
{"if even a moderate sized n is provided. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, NotDistractedBySourceCode) {
|
||||
TestParagraphDetection(kTextWithSourceCode,
|
||||
countof(kTextWithSourceCode));
|
||||
TestParagraphDetection(kTextWithSourceCode, countof(kTextWithSourceCode));
|
||||
}
|
||||
|
||||
const TextAndModel kOldManAndSea[] = {
|
||||
{"royal palm which are called guano and in it there was a bed, a",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false},
|
||||
{"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false},
|
||||
{"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false},
|
||||
{"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false},
|
||||
{"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false},
|
||||
{"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false},
|
||||
{"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false},
|
||||
{"shirt. ", PCONT, PModel(), false, false},
|
||||
{" \"What do you have to eat?\" the boy asked. ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"A pot of yellow rice with fish. Do you want some?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"No. I will eat at home. Do you want me to make the fire?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"No. I will make it later on. Or I may eat the rice cold.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"May I take the cast net?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"Of course.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" There was no cast net and the boy remembered when they had",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"sold it. But they went through this fiction every day. There was no", PCONT, PModel(), false, false},
|
||||
{"royal palm which are called guano and in it there was a bed, a", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(),
|
||||
false, false},
|
||||
{"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(),
|
||||
false, false},
|
||||
{"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(),
|
||||
false, false},
|
||||
{"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(),
|
||||
false, false},
|
||||
{"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(),
|
||||
false, false},
|
||||
{"wife on the wall but he had taken it down because it made him too", PCONT, PModel(),
|
||||
false, false},
|
||||
{"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(),
|
||||
false, false},
|
||||
{"shirt. ", PCONT, PModel(),
|
||||
false, false},
|
||||
{" \"What do you have to eat?\" the boy asked. ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"A pot of yellow rice with fish. Do you want some?\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"No. I will eat at home. Do you want me to make the fire?\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"No. I will make it later on. Or I may eat the rice cold.\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"May I take the cast net?\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"Of course.\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" There was no cast net and the boy remembered when they had", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"sold it. But they went through this fiction every day. There was no", PCONT, PModel(),
|
||||
false, false},
|
||||
{"pot of yellow rice and fish and the boy knew this too. "
|
||||
" ", PCONT, PModel(), false, false},
|
||||
{" \"Eighty-five is a lucky number,\" the old man said. \"How",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
" ",
|
||||
PCONT, PModel(), false, false},
|
||||
{" \"Eighty-five is a lucky number,\" the old man said. \"How", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"would you like to see me bring one in that dressed out over a "
|
||||
"thou-", PCONT, PModel(), false, false},
|
||||
"thou-",
|
||||
PCONT, PModel(), false, false},
|
||||
{"sand pounds? "
|
||||
" ", PCONT, PModel(), false, false},
|
||||
" ",
|
||||
PCONT, PModel(), false, false},
|
||||
{" \"I'll get the cast net and go for sardines. Will you sit in the "
|
||||
"sun",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"in the doorway?\" "
|
||||
" ", PCONT, PModel(), false, false},
|
||||
{" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" The boy did not know whether yesterday's paper was a fiction",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false},
|
||||
" ",
|
||||
PCONT, PModel(), false, false},
|
||||
{" \"Yes. I have yesterday's paper and I will read the baseball.\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" The boy did not know whether yesterday's paper was a fiction", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"too. But the old man brought it out from under the bed. ", PCONT, PModel(),
|
||||
false, false},
|
||||
{" \"Pedrico gave it to me at the bodega,\" he explained. "
|
||||
" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"I'll be back when I have the sardines. I'll keep yours and mine",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false},
|
||||
{"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false},
|
||||
{" \"The Yankees cannot lose.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"But I fear the Indians of Cleveland.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"Have faith in the Yankees my son. Think of the great Di-",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"Maggio.\" ", PCONT, PModel(), false, false},
|
||||
{" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"land.\" ", PCONT, PModel(), false, false}
|
||||
};
|
||||
{" \"I'll be back when I have the sardines. I'll keep yours and mine", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"together on ice and we can share them in the morning. When I", PCONT, PModel(),
|
||||
false, false},
|
||||
{"come back you can tell me about the baseball.\" ", PCONT, PModel(),
|
||||
false, false},
|
||||
{" \"The Yankees cannot lose.\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"But I fear the Indians of Cleveland.\" ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"Have faith in the Yankees my son. Think of the great Di-", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"Maggio.\" ", PCONT, PModel(),
|
||||
false, false},
|
||||
{" \"I fear both the Tigers of Detroit and the Indians of Cleve-", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"land.\" ", PCONT, PModel(),
|
||||
false, false}};
|
||||
|
||||
TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
|
||||
TestParagraphDetection(kOldManAndSea, countof(kOldManAndSea));
|
||||
@ -693,8 +715,7 @@ const TextAndModel kNewZealandIndex[] = {
|
||||
{"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}
|
||||
};
|
||||
{"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}};
|
||||
|
||||
TEST(ParagraphsTest, IndexPageTest) {
|
||||
TestParagraphDetection(kNewZealandIndex, countof(kNewZealandIndex));
|
||||
@ -702,4 +723,4 @@ TEST(ParagraphsTest, IndexPageTest) {
|
||||
|
||||
// TODO(eger): Add some right-to-left examples, and fix the algorithm as needed.
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -72,4 +72,4 @@ TEST_F(ParamsModelTest, TestEngParamsModelIO) {
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -36,12 +36,13 @@ namespace tesseract {
|
||||
|
||||
class QuickTest : public testing::Test {
|
||||
protected:
|
||||
virtual void SetUp() { start_time_ = time(nullptr); }
|
||||
virtual void SetUp() {
|
||||
start_time_ = time(nullptr);
|
||||
}
|
||||
virtual void TearDown() {
|
||||
const time_t end_time = time(nullptr);
|
||||
EXPECT_TRUE(end_time - start_time_ <= 25)
|
||||
<< "The test took too long - "
|
||||
<< ::testing::PrintToString(end_time - start_time_);
|
||||
<< "The test took too long - " << ::testing::PrintToString(end_time - start_time_);
|
||||
}
|
||||
time_t start_time_;
|
||||
};
|
||||
@ -74,15 +75,13 @@ class NewMockProgressSink : public ClassicMockProgressSink {
|
||||
MOCK_METHOD1(progress, bool(int));
|
||||
|
||||
NewMockProgressSink() {
|
||||
monitor.progress_callback2 = [](ETEXT_DESC* ths, int, int, int,
|
||||
int) -> bool {
|
||||
monitor.progress_callback2 = [](ETEXT_DESC *ths, int, int, int, int) -> bool {
|
||||
return ((NewMockProgressSink *)ths->cancel_this)->progress(ths->progress);
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
void ClassicProgressTester(const char* imgname, const char* tessdatadir,
|
||||
const char* lang) {
|
||||
void ClassicProgressTester(const char *imgname, const char *tessdatadir, const char *lang) {
|
||||
using ::testing::_;
|
||||
using ::testing::AllOf;
|
||||
using ::testing::AtLeast;
|
||||
@ -93,8 +92,7 @@ void ClassicProgressTester(const char* imgname, const char* tessdatadir,
|
||||
using ::testing::SaveArg;
|
||||
|
||||
std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang))
|
||||
<< "Could not initialize tesseract.";
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract.";
|
||||
Pix *image = pixRead(imgname);
|
||||
ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
|
||||
api->SetImage(image);
|
||||
@ -102,13 +100,10 @@ void ClassicProgressTester(const char* imgname, const char* tessdatadir,
|
||||
ClassicMockProgressSink progressSink;
|
||||
|
||||
int currentProgress = -1;
|
||||
EXPECT_CALL(progressSink,
|
||||
classicProgress(AllOf(Gt<int&>(currentProgress), Le(100))))
|
||||
EXPECT_CALL(progressSink, classicProgress(AllOf(Gt<int &>(currentProgress), Le(100))))
|
||||
.Times(AtLeast(5))
|
||||
.WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false)));
|
||||
EXPECT_CALL(progressSink, cancel(_))
|
||||
.Times(AtLeast(5))
|
||||
.WillRepeatedly(Return(false));
|
||||
EXPECT_CALL(progressSink, cancel(_)).Times(AtLeast(5)).WillRepeatedly(Return(false));
|
||||
|
||||
EXPECT_EQ(api->Recognize(&progressSink.monitor), false);
|
||||
EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%";
|
||||
@ -117,8 +112,7 @@ void ClassicProgressTester(const char* imgname, const char* tessdatadir,
|
||||
pixDestroy(&image);
|
||||
}
|
||||
|
||||
void NewProgressTester(const char* imgname, const char* tessdatadir,
|
||||
const char* lang) {
|
||||
void NewProgressTester(const char *imgname, const char *tessdatadir, const char *lang) {
|
||||
using ::testing::_;
|
||||
using ::testing::AllOf;
|
||||
using ::testing::AtLeast;
|
||||
@ -129,8 +123,7 @@ void NewProgressTester(const char* imgname, const char* tessdatadir,
|
||||
using ::testing::SaveArg;
|
||||
|
||||
std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang))
|
||||
<< "Could not initialize tesseract.";
|
||||
ASSERT_FALSE(api->Init(tessdatadir, lang)) << "Could not initialize tesseract.";
|
||||
Pix *image = pixRead(imgname);
|
||||
ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
|
||||
api->SetImage(image);
|
||||
@ -142,9 +135,7 @@ void NewProgressTester(const char* imgname, const char* tessdatadir,
|
||||
EXPECT_CALL(progressSink, progress(AllOf(Gt<int &>(currentProgress), Le(100))))
|
||||
.Times(AtLeast(5))
|
||||
.WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false)));
|
||||
EXPECT_CALL(progressSink, cancel(_))
|
||||
.Times(AtLeast(5))
|
||||
.WillRepeatedly(Return(false));
|
||||
EXPECT_CALL(progressSink, cancel(_)).Times(AtLeast(5)).WillRepeatedly(Return(false));
|
||||
|
||||
EXPECT_EQ(api->Recognize(&progressSink.monitor), false);
|
||||
EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%";
|
||||
@ -154,12 +145,11 @@ void NewProgressTester(const char* imgname, const char* tessdatadir,
|
||||
}
|
||||
|
||||
TEST(QuickTest, ClassicProgressReporting) {
|
||||
ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast",
|
||||
"eng");
|
||||
ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng");
|
||||
}
|
||||
|
||||
TEST(QuickTest, NewProgressReporting) {
|
||||
NewProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng");
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,7 +9,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
@ -50,7 +49,8 @@ TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) {
|
||||
std::vector<int> vals(kRangeSize);
|
||||
CycleTimer timer;
|
||||
timer.Restart();
|
||||
for (int i = 0; i < kRangeSize; ++i) vals[i] = generator.GetVal();
|
||||
for (int i = 0; i < kRangeSize; ++i)
|
||||
vals[i] = generator.GetVal();
|
||||
LOG(INFO) << kRangeSize << "-length sequence took " << timer.GetInMs() << "ms";
|
||||
// Sort the numbers to verify that we've covered the range without repetition.
|
||||
std::sort(vals.begin(), vals.end());
|
||||
@ -66,4 +66,4 @@ TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) {
|
||||
// Run a parameterized test using the following range sizes.
|
||||
INSTANTIATE_TEST_SUITE_P(RangeTest, QRSequenceGeneratorTest,
|
||||
::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6));
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -9,17 +9,16 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "log.h" // for LOG
|
||||
|
||||
#include "genericvector.h"
|
||||
#include "recodebeam.h"
|
||||
#include "matrix.h"
|
||||
#include "normstrngs.h"
|
||||
#include "pageres.h"
|
||||
#include "ratngs.h"
|
||||
#include "recodebeam.h"
|
||||
#include "unicharcompress.h"
|
||||
#include "normstrngs.h"
|
||||
#include "unicharset_training_utils.h"
|
||||
|
||||
#include "helpers.h"
|
||||
@ -42,14 +41,12 @@ const int kPadding = 64;
|
||||
// weak space between words and right.
|
||||
const char *kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
|
||||
"s", "", "r", "i", "g", "h", "t", ".", nullptr};
|
||||
const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65,
|
||||
0.89, 0.99, 0.99, 0.99, 0.99, 0.95,
|
||||
0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
|
||||
const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99,
|
||||
0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
|
||||
const char *kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h",
|
||||
"S", " ", "t", "I", "9", "b", "f", ",", nullptr};
|
||||
const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25,
|
||||
0.10, 0.01, 0.01, 0.01, 0.01, 0.05,
|
||||
0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
|
||||
const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01,
|
||||
0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
|
||||
|
||||
const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
|
||||
const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
|
||||
@ -69,24 +66,21 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
RecodeBeamTest() : lstm_dict_(&ccutil_) {}
|
||||
~RecodeBeamTest() { lstm_dict_.End(); }
|
||||
~RecodeBeamTest() {
|
||||
lstm_dict_.End();
|
||||
}
|
||||
|
||||
// Loads and compresses the given unicharset.
|
||||
void LoadUnicharset(const std::string &unicharset_name) {
|
||||
std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR,
|
||||
"radical-stroke.txt");
|
||||
std::string unicharset_file =
|
||||
file::JoinPath(TESTDATA_DIR, unicharset_name);
|
||||
std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
|
||||
std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
|
||||
std::string radical_data;
|
||||
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
|
||||
file::Defaults()));
|
||||
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
|
||||
CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
|
||||
unichar_null_char_ = ccutil_.unicharset.has_special_codes()
|
||||
? UNICHAR_BROKEN
|
||||
: ccutil_.unicharset.size();
|
||||
unichar_null_char_ =
|
||||
ccutil_.unicharset.has_special_codes() ? UNICHAR_BROKEN : ccutil_.unicharset.size();
|
||||
STRING radical_str(radical_data.c_str());
|
||||
EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_,
|
||||
&radical_str));
|
||||
EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
|
||||
RecodedCharID code;
|
||||
recoder_.EncodeUnichar(unichar_null_char_, &code);
|
||||
encoded_null_char_ = code(0);
|
||||
@ -102,8 +96,7 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
// Loads the dictionary.
|
||||
void LoadDict(const std::string &lang) {
|
||||
std::string traineddata_name = lang + ".traineddata";
|
||||
std::string traineddata_file =
|
||||
file::JoinPath(TESTDATA_DIR, traineddata_name);
|
||||
std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
|
||||
lstm_dict_.SetupForLoad(nullptr);
|
||||
tesseract::TessdataManager mgr;
|
||||
mgr.Init(traineddata_file.c_str());
|
||||
@ -122,17 +115,15 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
PointerVector<WERD_RES> words;
|
||||
ExpectCorrect(output, truth_utf8, nullptr, &words);
|
||||
}
|
||||
void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output,
|
||||
const std::string& truth_utf8, Dict* dict,
|
||||
PointerVector<WERD_RES>* words) {
|
||||
void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,
|
||||
Dict *dict, PointerVector<WERD_RES> *words) {
|
||||
RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
|
||||
beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
|
||||
// Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
|
||||
// beam_search.DebugBeams(ccutil_.unicharset);
|
||||
std::vector<int> labels, xcoords;
|
||||
beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
|
||||
LOG(INFO) << "Labels size = " << labels.size() << " coords "
|
||||
<< xcoords.size() << "\n";
|
||||
LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
|
||||
// Now decode using recoder_.
|
||||
std::string decoded;
|
||||
int end = 1;
|
||||
@ -143,12 +134,9 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
do {
|
||||
code.Set(code.length(), labels[index++]);
|
||||
uni_id = recoder_.DecodeUnichar(code);
|
||||
} while (index < labels.size() &&
|
||||
code.length() < RecodedCharID::kMaxCodeLen &&
|
||||
(uni_id == INVALID_UNICHAR_ID ||
|
||||
!recoder_.IsValidFirstCode(labels[index])));
|
||||
EXPECT_NE(INVALID_UNICHAR_ID, uni_id)
|
||||
<< "index=" << index << "/" << labels.size();
|
||||
} while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
|
||||
(uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
|
||||
EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
|
||||
// To the extent of truth_utf8, we expect decoded to match, but if
|
||||
// transcription is shorter, that is OK too, as we may just be testing
|
||||
// that we get a valid sequence when padded with random data.
|
||||
@ -161,8 +149,7 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
// Check that ExtractBestPathAsUnicharIds does the same thing.
|
||||
std::vector<int> unichar_ids;
|
||||
std::vector<float> certainties, ratings;
|
||||
beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset,
|
||||
&unichar_ids, &certainties,
|
||||
beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
|
||||
&ratings, &xcoords);
|
||||
std::string u_decoded;
|
||||
float total_rating = 0.0f;
|
||||
@ -173,10 +160,11 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
if (u_decoded.size() < truth_utf8.size()) {
|
||||
const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
|
||||
total_rating += ratings[u];
|
||||
LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u,
|
||||
unichar_ids[u], str, certainties[u],
|
||||
ratings[u], total_rating, xcoords[u]) << "\n";
|
||||
if (str[0] == ' ') total_rating = 0.0f;
|
||||
LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u, unichar_ids[u],
|
||||
str, certainties[u], ratings[u], total_rating, xcoords[u])
|
||||
<< "\n";
|
||||
if (str[0] == ' ')
|
||||
total_rating = 0.0f;
|
||||
u_decoded += str;
|
||||
}
|
||||
}
|
||||
@ -185,20 +173,20 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
// Check that ExtractBestPathAsWords does the same thing.
|
||||
TBOX line_box(0, 0, 100, 10);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
beam_search.ExtractBestPathAsWords(line_box, 1.0f, false,
|
||||
&ccutil_.unicharset, words);
|
||||
beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
|
||||
std::string w_decoded;
|
||||
for (int w = 0; w < words->size(); ++w) {
|
||||
const WERD_RES *word = (*words)[w];
|
||||
if (w_decoded.size() < truth_utf8.size()) {
|
||||
if (!w_decoded.empty() && word->word->space()) w_decoded += " ";
|
||||
if (!w_decoded.empty() && word->word->space())
|
||||
w_decoded += " ";
|
||||
w_decoded += word->best_choice->unichar_string().c_str();
|
||||
}
|
||||
LOG(INFO) << absl::StrFormat("Word:%d = %s, c=%g, r=%g, perm=%d", w,
|
||||
word->best_choice->unichar_string().c_str(),
|
||||
word->best_choice->certainty(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->permuter()) << "\n";
|
||||
word->best_choice->certainty(), word->best_choice->rating(),
|
||||
word->best_choice->permuter())
|
||||
<< "\n";
|
||||
}
|
||||
std::string w_trunc(w_decoded.data(), truth_utf8.size());
|
||||
if (truth_utf8 != w_trunc) {
|
||||
@ -212,8 +200,8 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
}
|
||||
// Generates easy encoding of the given unichar_ids, and pads with at least
|
||||
// padding of random data.
|
||||
GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(
|
||||
const GenericVector<int>& unichar_ids, int padding) {
|
||||
GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const GenericVector<int> &unichar_ids,
|
||||
int padding) {
|
||||
int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
|
||||
int num_codes = recoder_.code_range();
|
||||
GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
|
||||
@ -242,8 +230,10 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
// Normalize the probs.
|
||||
for (int t = 0; t < width; ++t) {
|
||||
double sum = 0.0;
|
||||
for (int i = 0; i < num_codes; ++i) sum += outputs(t, i);
|
||||
for (int i = 0; i < num_codes; ++i) outputs(t, i) /= sum;
|
||||
for (int i = 0; i < num_codes; ++i)
|
||||
sum += outputs(t, i);
|
||||
for (int i = 0; i < num_codes; ++i)
|
||||
outputs(t, i) /= sum;
|
||||
}
|
||||
|
||||
return outputs;
|
||||
@ -254,8 +244,7 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
GENERIC_2D_ARRAY<float> *outputs) {
|
||||
int t = start_t;
|
||||
std::vector<int> unichar_ids;
|
||||
EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids,
|
||||
nullptr, nullptr));
|
||||
EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
|
||||
if (unichar_ids.empty() || utf8_str[0] == '\0') {
|
||||
unichar_ids.clear();
|
||||
unichar_ids.push_back(unichar_null_char_);
|
||||
@ -268,8 +257,7 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
for (int i = 0; i < len; ++i) {
|
||||
// Apply the desired score.
|
||||
(*outputs)(t++, code(i)) = score;
|
||||
if (random != nullptr &&
|
||||
t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
|
||||
if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
|
||||
int dups = static_cast<int>(random->UnsignedRand(3.0));
|
||||
for (int d = 0; d < dups; ++d) {
|
||||
// Duplicate the desired score.
|
||||
@ -277,8 +265,7 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (random != nullptr &&
|
||||
t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
|
||||
if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
|
||||
int dups = static_cast<int>(random->UnsignedRand(3.0));
|
||||
for (int d = 0; d < dups; ++d) {
|
||||
// Add a random number of nulls as well.
|
||||
@ -292,13 +279,12 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
// uses scores1 for chars1 and scores2 for chars2, and everything else gets
|
||||
// the leftovers shared out equally. Note that empty string encodes as the
|
||||
// null_char_.
|
||||
GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char* chars1[],
|
||||
const float scores1[],
|
||||
const char* chars2[],
|
||||
const float scores2[],
|
||||
GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
|
||||
const char *chars2[], const float scores2[],
|
||||
TRand *random) {
|
||||
int width = 0;
|
||||
while (chars1[width] != nullptr) ++width;
|
||||
while (chars1[width] != nullptr)
|
||||
++width;
|
||||
int padding = width * RecodedCharID::kMaxCodeLen;
|
||||
int num_codes = recoder_.code_range();
|
||||
GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
|
||||
@ -312,7 +298,8 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
int max_t = std::max(end_t1, end_t2);
|
||||
while (t < max_t) {
|
||||
double total_score = 0.0;
|
||||
for (int j = 0; j < num_codes; ++j) total_score += outputs(t, j);
|
||||
for (int j = 0; j < num_codes; ++j)
|
||||
total_score += outputs(t, j);
|
||||
double null_remainder = (1.0 - total_score) / 2.0;
|
||||
double remainder = null_remainder / (num_codes - 2);
|
||||
if (outputs(t, encoded_null_char_) < null_remainder) {
|
||||
@ -321,7 +308,8 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
remainder += remainder;
|
||||
}
|
||||
for (int j = 0; j < num_codes; ++j) {
|
||||
if (outputs(t, j) == 0.0f) outputs(t, j) = remainder;
|
||||
if (outputs(t, j) == 0.0f)
|
||||
outputs(t, j) = remainder;
|
||||
}
|
||||
++t;
|
||||
}
|
||||
@ -340,16 +328,17 @@ class RecodeBeamTest : public ::testing::Test {
|
||||
};
|
||||
|
||||
TEST_F(RecodeBeamTest, DoesChinese) {
|
||||
LOG(INFO) << "Testing chi_tra" << "\n";
|
||||
LOG(INFO) << "Testing chi_tra"
|
||||
<< "\n";
|
||||
LoadUnicharset("chi_tra.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
GenericVector<int> transcription;
|
||||
for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
|
||||
transcription.push_back(i);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
ExpectCorrect(outputs, transcription);
|
||||
LOG(INFO) << "Testing chi_sim" << "\n";
|
||||
LOG(INFO) << "Testing chi_sim"
|
||||
<< "\n";
|
||||
LoadUnicharset("chi_sim.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
transcription.clear();
|
||||
@ -360,72 +349,74 @@ TEST_F(RecodeBeamTest, DoesChinese) {
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DoesJapanese) {
|
||||
LOG(INFO) << "Testing jpn" << "\n";
|
||||
LOG(INFO) << "Testing jpn"
|
||||
<< "\n";
|
||||
LoadUnicharset("jpn.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
GenericVector<int> transcription;
|
||||
for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
|
||||
transcription.push_back(i);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
ExpectCorrect(outputs, transcription);
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DoesKorean) {
|
||||
LOG(INFO) << "Testing kor" << "\n";
|
||||
LOG(INFO) << "Testing kor"
|
||||
<< "\n";
|
||||
LoadUnicharset("kor.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
GenericVector<int> transcription;
|
||||
for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
|
||||
transcription.push_back(i);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
ExpectCorrect(outputs, transcription);
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DoesKannada) {
|
||||
LOG(INFO) << "Testing kan" << "\n";
|
||||
LOG(INFO) << "Testing kan"
|
||||
<< "\n";
|
||||
LoadUnicharset("kan.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
GenericVector<int> transcription;
|
||||
for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
|
||||
transcription.push_back(i);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
ExpectCorrect(outputs, transcription);
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DoesMarathi) {
|
||||
LOG(INFO) << "Testing mar" << "\n";
|
||||
LOG(INFO) << "Testing mar"
|
||||
<< "\n";
|
||||
LoadUnicharset("mar.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
GenericVector<int> transcription;
|
||||
for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
|
||||
transcription.push_back(i);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
ExpectCorrect(outputs, transcription);
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DoesEnglish) {
|
||||
LOG(INFO) << "Testing eng" << "\n";
|
||||
LOG(INFO) << "Testing eng"
|
||||
<< "\n";
|
||||
LoadUnicharset("eng.unicharset");
|
||||
// Correctly reproduce the first kNumchars characters from easy output.
|
||||
GenericVector<int> transcription;
|
||||
for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
|
||||
transcription.push_back(i);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
|
||||
ExpectCorrect(outputs, transcription);
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
|
||||
LOG(INFO) << "Testing eng dictionary" << "\n";
|
||||
LOG(INFO) << "Testing eng dictionary"
|
||||
<< "\n";
|
||||
LoadUnicharset("eng_beam.unicharset");
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
|
||||
kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
|
||||
std::string default_str;
|
||||
for (int i = 0; kGWRTops[i] != nullptr; ++i) default_str += kGWRTops[i];
|
||||
for (int i = 0; kGWRTops[i] != nullptr; ++i)
|
||||
default_str += kGWRTops[i];
|
||||
PointerVector<WERD_RES> words;
|
||||
ExpectCorrect(outputs, default_str, nullptr, &words);
|
||||
// Now try again with the dictionary.
|
||||
@ -434,10 +425,11 @@ TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
|
||||
}
|
||||
|
||||
TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
|
||||
LOG(INFO) << "Testing zh_hans dictionary" << "\n";
|
||||
LOG(INFO) << "Testing zh_hans dictionary"
|
||||
<< "\n";
|
||||
LoadUnicharset("zh_hans.unicharset");
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
|
||||
kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
|
||||
PointerVector<WERD_RES> words;
|
||||
ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
|
||||
// Each is an individual word, with permuter = top choice.
|
||||
@ -453,9 +445,8 @@ TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
|
||||
// Content of the words.
|
||||
const char *kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
|
||||
// Permuters of the words.
|
||||
const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM,
|
||||
TOP_CHOICE_PERM, TOP_CHOICE_PERM,
|
||||
SYSTEM_DAWG_PERM};
|
||||
const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM,
|
||||
TOP_CHOICE_PERM, SYSTEM_DAWG_PERM};
|
||||
EXPECT_EQ(kNumWords, words.size());
|
||||
for (int w = 0; w < kNumWords && w < words.size(); ++w) {
|
||||
EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
|
||||
@ -466,18 +457,18 @@ TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
|
||||
// Tests that a recoder built with decomposed unicode allows true ctc
|
||||
// arbitrary duplicates and inserted nulls inside the multicode sequence.
|
||||
TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
|
||||
LOG(INFO) << "Testing duplicates in multi-code sequences" << "\n";
|
||||
LOG(INFO) << "Testing duplicates in multi-code sequences"
|
||||
<< "\n";
|
||||
LoadUnicharset("vie.d.unicharset");
|
||||
tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
|
||||
TRand random;
|
||||
GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
|
||||
kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
|
||||
GENERIC_2D_ARRAY<float> outputs =
|
||||
GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
|
||||
PointerVector<WERD_RES> words;
|
||||
std::string truth_str;
|
||||
tesseract::NormalizeUTF8String(
|
||||
tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
|
||||
tesseract::NormalizeUTF8String(tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
|
||||
tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
|
||||
ExpectCorrect(outputs, truth_str, nullptr, &words);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -52,10 +52,8 @@ TEST_F(TBOXTest, OverlapFractionCorners) {
|
||||
TBOX top_left(5, 25, 15, 35);
|
||||
// other corners covered by symmetry
|
||||
|
||||
EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0),
|
||||
mid.overlap_fraction(bottom_left));
|
||||
EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0),
|
||||
bottom_left.overlap_fraction(mid));
|
||||
EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(bottom_left));
|
||||
EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), bottom_left.overlap_fraction(mid));
|
||||
EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(top_left));
|
||||
EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), top_left.overlap_fraction(mid));
|
||||
}
|
||||
@ -102,14 +100,10 @@ TEST_F(TBOXTest, OverlapFractionSpan) {
|
||||
TBOX horizontal(5, 15, 35, 25);
|
||||
// other sides covered by symmetry in other test cases
|
||||
|
||||
EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0),
|
||||
mid.overlap_fraction(vertical));
|
||||
EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0),
|
||||
vertical.overlap_fraction(mid));
|
||||
EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0),
|
||||
mid.overlap_fraction(horizontal));
|
||||
EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0),
|
||||
horizontal.overlap_fraction(mid));
|
||||
EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0), mid.overlap_fraction(vertical));
|
||||
EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0), vertical.overlap_fraction(mid));
|
||||
EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(horizontal));
|
||||
EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0), horizontal.overlap_fraction(mid));
|
||||
}
|
||||
|
||||
// TODO(nbeato): pretty much all cases
|
||||
@ -173,4 +167,4 @@ TEST_F(TBOXTest, OverlapYFractionZeroSize) {
|
||||
EXPECT_DOUBLE_EQ(0.0, small.y_overlap_fraction(zero));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -1,14 +1,14 @@
|
||||
|
||||
#include <tesseract/resultiterator.h>
|
||||
#include <string>
|
||||
#include <allheaders.h>
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/resultiterator.h>
|
||||
#include <string>
|
||||
#include "genericvector.h"
|
||||
#include "scrollview.h"
|
||||
|
||||
#include "absl/strings/str_format.h" // for absl::StrFormat
|
||||
#include "include_gunit.h"
|
||||
#include "log.h" // for LOG
|
||||
#include "absl/strings/str_format.h" // for absl::StrFormat
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -19,13 +19,15 @@ namespace tesseract {
|
||||
template <typename T>
|
||||
void ToVector(const GenericVector<T> &from, std::vector<T> *to) {
|
||||
to->clear();
|
||||
for (int i = 0; i < from.size(); i++) to->push_back(from[i]);
|
||||
for (int i = 0; i < from.size(); i++)
|
||||
to->push_back(from[i]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ToVector(const std::vector<T> &from, std::vector<T> *to) {
|
||||
to->clear();
|
||||
for (int i = 0; i < from.size(); i++) to->push_back(from[i]);
|
||||
for (int i = 0; i < from.size(); i++)
|
||||
to->push_back(from[i]);
|
||||
}
|
||||
|
||||
// The fixture for testing Tesseract.
|
||||
@ -42,7 +44,9 @@ class ResultIteratorTest : public testing::Test {
|
||||
return file::JoinPath(FLAGS_test_tmpdir, name);
|
||||
}
|
||||
|
||||
ResultIteratorTest() { src_pix_ = nullptr; }
|
||||
ResultIteratorTest() {
|
||||
src_pix_ = nullptr;
|
||||
}
|
||||
~ResultIteratorTest() {}
|
||||
|
||||
void SetImage(const char *filename) {
|
||||
@ -66,7 +70,8 @@ class ResultIteratorTest : public testing::Test {
|
||||
int depth = pixGetDepth(src_pix_);
|
||||
Pix *pix = pixCreate(width, height, depth);
|
||||
EXPECT_TRUE(depth == 1 || depth == 8);
|
||||
if (depth == 8) pixSetAll(pix);
|
||||
if (depth == 8)
|
||||
pixSetAll(pix);
|
||||
do {
|
||||
int left, top, right, bottom;
|
||||
PageIteratorLevel im_level = level;
|
||||
@ -75,18 +80,17 @@ class ResultIteratorTest : public testing::Test {
|
||||
im_level = tesseract::RIL_BLOCK;
|
||||
EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom));
|
||||
}
|
||||
LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right
|
||||
<< ", B:" << bottom << "]" << "\n";
|
||||
LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right << ", B:" << bottom
|
||||
<< "]"
|
||||
<< "\n";
|
||||
Pix *block_pix;
|
||||
if (depth == 1) {
|
||||
block_pix = it->GetBinaryImage(im_level);
|
||||
pixRasterop(pix, left, top, right - left, bottom - top,
|
||||
PIX_SRC ^ PIX_DST, block_pix, 0, 0);
|
||||
pixRasterop(pix, left, top, right - left, bottom - top, PIX_SRC ^ PIX_DST, block_pix, 0, 0);
|
||||
} else {
|
||||
block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top);
|
||||
pixRasterop(pix, left, top, pixGetWidth(block_pix),
|
||||
pixGetHeight(block_pix), PIX_SRC & PIX_DST, block_pix, 0,
|
||||
0);
|
||||
pixRasterop(pix, left, top, pixGetWidth(block_pix), pixGetHeight(block_pix),
|
||||
PIX_SRC & PIX_DST, block_pix, 0, 0);
|
||||
}
|
||||
CHECK(block_pix != nullptr);
|
||||
pixDestroy(&block_pix);
|
||||
@ -117,8 +121,7 @@ class ResultIteratorTest : public testing::Test {
|
||||
|
||||
// Rebuilds the text from the iterator strings at the given level, and
|
||||
// EXPECTs that the rebuild string exactly matches the truth string.
|
||||
void VerifyIteratorText(const std::string& truth, PageIteratorLevel level,
|
||||
ResultIterator* it) {
|
||||
void VerifyIteratorText(const std::string &truth, PageIteratorLevel level, ResultIterator *it) {
|
||||
LOG(INFO) << "Text Test Level " << level << "\n";
|
||||
it->Begin();
|
||||
std::string result;
|
||||
@ -138,12 +141,11 @@ class ResultIteratorTest : public testing::Test {
|
||||
result += '\n';
|
||||
}
|
||||
} while (it->Next(level));
|
||||
EXPECT_STREQ(truth.c_str(), result.c_str())
|
||||
<< "Rebuild failed at Text Level " << level;
|
||||
EXPECT_STREQ(truth.c_str(), result.c_str()) << "Rebuild failed at Text Level " << level;
|
||||
}
|
||||
|
||||
void VerifyRebuilds(int block_limit, int para_limit, int line_limit,
|
||||
int word_limit, int symbol_limit, PageIterator* it) {
|
||||
void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit,
|
||||
int symbol_limit, PageIterator *it) {
|
||||
VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it);
|
||||
VerifyRebuild(para_limit, tesseract::RIL_PARA, it);
|
||||
VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it);
|
||||
@ -164,8 +166,7 @@ class ResultIteratorTest : public testing::Test {
|
||||
// expected output reading order
|
||||
// (expected_reading_order[num_reading_order_entries]) and a given reading
|
||||
// context (ltr or rtl).
|
||||
void ExpectTextlineReadingOrder(bool in_ltr_context,
|
||||
const StrongScriptDirection* word_dirs,
|
||||
void ExpectTextlineReadingOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs,
|
||||
int num_words, int *expected_reading_order,
|
||||
int num_reading_order_entries) const {
|
||||
std::vector<StrongScriptDirection> gv_word_dirs;
|
||||
@ -174,11 +175,9 @@ class ResultIteratorTest : public testing::Test {
|
||||
}
|
||||
|
||||
std::vector<int> output;
|
||||
ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs,
|
||||
&output);
|
||||
ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &output);
|
||||
// STL vector can be used with EXPECT_EQ, so convert...
|
||||
std::vector<int> correct_order(
|
||||
expected_reading_order,
|
||||
std::vector<int> correct_order(expected_reading_order,
|
||||
expected_reading_order + num_reading_order_entries);
|
||||
std::vector<int> calculated_order;
|
||||
ToVector(output, &calculated_order);
|
||||
@ -189,8 +188,7 @@ class ResultIteratorTest : public testing::Test {
|
||||
// for a given array of word_dirs[num_words] in ltr or rtl context.
|
||||
// Sane means that the output contains some permutation of the indices
|
||||
// 0..[num_words - 1] interspersed optionally with negative (marker) values.
|
||||
void VerifySaneTextlineOrder(bool in_ltr_context,
|
||||
const StrongScriptDirection* word_dirs,
|
||||
void VerifySaneTextlineOrder(bool in_ltr_context, const StrongScriptDirection *word_dirs,
|
||||
int num_words) const {
|
||||
std::vector<StrongScriptDirection> gv_word_dirs;
|
||||
for (int i = 0; i < num_words; i++) {
|
||||
@ -198,14 +196,14 @@ class ResultIteratorTest : public testing::Test {
|
||||
}
|
||||
|
||||
std::vector<int> output;
|
||||
ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs,
|
||||
&output);
|
||||
ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, &output);
|
||||
ASSERT_GE(output.size(), num_words);
|
||||
std::vector<int> output_copy(output);
|
||||
std::sort(output_copy.begin(), output_copy.end());
|
||||
bool sane = true;
|
||||
int j = 0;
|
||||
while (j < output_copy.size() && output_copy[j] < 0) j++;
|
||||
while (j < output_copy.size() && output_copy[j] < 0)
|
||||
j++;
|
||||
for (int i = 0; i < num_words; i++, j++) {
|
||||
if (output_copy[j] != i) {
|
||||
sane = false;
|
||||
@ -218,8 +216,7 @@ class ResultIteratorTest : public testing::Test {
|
||||
if (!sane) {
|
||||
std::vector<int> output_copy2, empty;
|
||||
ToVector(output, &output_copy2);
|
||||
EXPECT_EQ(output_copy2, empty)
|
||||
<< " permutation of 0.." << num_words - 1 << " not found in "
|
||||
EXPECT_EQ(output_copy2, empty) << " permutation of 0.." << num_words - 1 << " not found in "
|
||||
<< (in_ltr_context ? "ltr" : "rtl") << " context.";
|
||||
}
|
||||
}
|
||||
@ -285,7 +282,8 @@ TEST_F(ResultIteratorTest, EasyTest) {
|
||||
EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
|
||||
|
||||
// The images should rebuild almost perfectly.
|
||||
LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" << "\n";
|
||||
LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)"
|
||||
<< "\n";
|
||||
VerifyRebuilds(10, 10, 0, 0, 0, p_it);
|
||||
delete p_it;
|
||||
|
||||
@ -294,14 +292,17 @@ TEST_F(ResultIteratorTest, EasyTest) {
|
||||
delete[] result;
|
||||
ResultIterator *r_it = api_.GetIterator();
|
||||
// The images should rebuild almost perfectly.
|
||||
LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" << "\n";
|
||||
LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)"
|
||||
<< "\n";
|
||||
VerifyRebuilds(8, 8, 0, 0, 40, r_it);
|
||||
// Test the text.
|
||||
LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" << "\n";
|
||||
LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)"
|
||||
<< "\n";
|
||||
VerifyAllText(ocr_text_, r_it);
|
||||
|
||||
// The images should rebuild almost perfectly.
|
||||
LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" << "\n";
|
||||
LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)"
|
||||
<< "\n";
|
||||
VerifyRebuilds(8, 8, 0, 0, 40, r_it);
|
||||
|
||||
r_it->Begin();
|
||||
@ -325,14 +326,14 @@ TEST_F(ResultIteratorTest, EasyTest) {
|
||||
do {
|
||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||
int pointsize, font_id;
|
||||
const char* font =
|
||||
r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
|
||||
&serif, &smallcaps, &pointsize, &font_id);
|
||||
const char *font = r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
|
||||
&smallcaps, &pointsize, &font_id);
|
||||
float confidence = r_it->Confidence(tesseract::RIL_WORD);
|
||||
EXPECT_GE(confidence, 80.0f);
|
||||
char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
|
||||
LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g",
|
||||
word_str, font, font_id, pointsize, confidence) << "\n";
|
||||
LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g", word_str, font,
|
||||
font_id, pointsize, confidence)
|
||||
<< "\n";
|
||||
delete[] word_str;
|
||||
EXPECT_FALSE(bold);
|
||||
EXPECT_FALSE(italic);
|
||||
@ -383,32 +384,30 @@ TEST_F(ResultIteratorTest, SmallCapDropCapTest) {
|
||||
do {
|
||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||
int pointsize, font_id;
|
||||
r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
|
||||
&smallcaps, &pointsize, &font_id);
|
||||
r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
|
||||
&pointsize, &font_id);
|
||||
char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
|
||||
if (word_str != nullptr) {
|
||||
LOG(INFO) << absl::StrFormat("Word %s is %s", word_str,
|
||||
smallcaps ? "SMALLCAPS" : "Normal") << "\n";
|
||||
LOG(INFO) << absl::StrFormat("Word %s is %s", word_str, smallcaps ? "SMALLCAPS" : "Normal")
|
||||
<< "\n";
|
||||
if (r_it->SymbolIsDropcap()) {
|
||||
++found_dropcaps;
|
||||
}
|
||||
if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 ||
|
||||
strcmp(word_str, "RALPH") == 0 ||
|
||||
strcmp(word_str, "KINNEY") == 0 || // Not working yet.
|
||||
strcmp(word_str, "RALPH") == 0 || strcmp(word_str, "KINNEY") == 0 || // Not working yet.
|
||||
strcmp(word_str, "BENNETT") == 0) {
|
||||
EXPECT_TRUE(smallcaps) << word_str;
|
||||
++found_smallcaps;
|
||||
} else {
|
||||
if (smallcaps) ++false_positives;
|
||||
if (smallcaps)
|
||||
++false_positives;
|
||||
}
|
||||
// No symbol other than the first of any word should be dropcap.
|
||||
ResultIterator s_it(*r_it);
|
||||
while (s_it.Next(tesseract::RIL_SYMBOL) &&
|
||||
!s_it.IsAtBeginningOf(tesseract::RIL_WORD)) {
|
||||
while (s_it.Next(tesseract::RIL_SYMBOL) && !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) {
|
||||
if (s_it.SymbolIsDropcap()) {
|
||||
char *sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
|
||||
LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str,
|
||||
word_str);
|
||||
LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str, word_str);
|
||||
delete[] sym_str;
|
||||
}
|
||||
EXPECT_FALSE(s_it.SymbolIsDropcap());
|
||||
@ -480,17 +479,12 @@ TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) {
|
||||
const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR};
|
||||
int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart,
|
||||
0, 1, 2, 3, ResultIterator::kMinorRunEnd};
|
||||
int reading_order_ltr_context[] = {0, 1,
|
||||
2, 3,
|
||||
4, ResultIterator::kMinorRunStart,
|
||||
7, 6,
|
||||
5, ResultIterator::kMinorRunEnd};
|
||||
int reading_order_ltr_context[] = {
|
||||
0, 1, 2, 3, 4, ResultIterator::kMinorRunStart, 7, 6, 5, ResultIterator::kMinorRunEnd};
|
||||
|
||||
ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs),
|
||||
reading_order_ltr_context,
|
||||
ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context,
|
||||
countof(reading_order_ltr_context));
|
||||
ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs),
|
||||
reading_order_rtl_context,
|
||||
ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,
|
||||
countof(reading_order_rtl_context));
|
||||
}
|
||||
|
||||
@ -502,15 +496,12 @@ TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) {
|
||||
int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7};
|
||||
// In the strange event that this shows up in an RTL paragraph, nonetheless
|
||||
// just presume the whole thing is an LTR line.
|
||||
int reading_order_rtl_context[] = {
|
||||
ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
int reading_order_rtl_context[] = {ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7,
|
||||
ResultIterator::kMinorRunEnd};
|
||||
|
||||
ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs),
|
||||
reading_order_ltr_context,
|
||||
ExpectTextlineReadingOrder(true, word_dirs, countof(word_dirs), reading_order_ltr_context,
|
||||
countof(reading_order_ltr_context));
|
||||
ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs),
|
||||
reading_order_rtl_context,
|
||||
ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,
|
||||
countof(reading_order_rtl_context));
|
||||
}
|
||||
|
||||
@ -520,8 +511,7 @@ TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) {
|
||||
const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR};
|
||||
// The order here is just right-to-left, nothing fancy.
|
||||
int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0};
|
||||
ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs),
|
||||
reading_order_rtl_context,
|
||||
ExpectTextlineReadingOrder(false, word_dirs, countof(word_dirs), reading_order_rtl_context,
|
||||
countof(reading_order_rtl_context));
|
||||
}
|
||||
|
||||
@ -560,13 +550,13 @@ TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) {
|
||||
do {
|
||||
const char *char_str = c_it.GetUTF8Text();
|
||||
if (char_str == nullptr)
|
||||
LOG(INFO) << "Null char choice" << "\n";
|
||||
LOG(INFO) << "Null char choice"
|
||||
<< "\n";
|
||||
else
|
||||
LOG(INFO) << "Char choice " << char_str << "\n";
|
||||
CHECK(char_str != nullptr);
|
||||
} while (c_it.Next());
|
||||
} while (
|
||||
!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
|
||||
} while (!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
|
||||
s_it.Next(tesseract::RIL_SYMBOL));
|
||||
delete[] word_str;
|
||||
}
|
||||
@ -595,18 +585,17 @@ TEST_F(ResultIteratorTest, NonNullConfidencesTest) {
|
||||
const char *char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
|
||||
CHECK(char_str != nullptr);
|
||||
float confidence = s_it.Confidence(tesseract::RIL_SYMBOL);
|
||||
LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str,
|
||||
confidence);
|
||||
LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str, confidence);
|
||||
delete[] char_str;
|
||||
} while (
|
||||
!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
|
||||
} while (!s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
|
||||
s_it.Next(tesseract::RIL_SYMBOL));
|
||||
delete[] word_str;
|
||||
} else {
|
||||
LOG(INFO) << "Empty word found" << "\n";
|
||||
LOG(INFO) << "Empty word found"
|
||||
<< "\n";
|
||||
}
|
||||
} while (r_it->Next(tesseract::RIL_WORD));
|
||||
delete r_it;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -18,8 +18,7 @@ namespace tesseract {
|
||||
|
||||
class ScanutilsTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
}
|
||||
void SetUp() override {}
|
||||
};
|
||||
|
||||
TEST_F(ScanutilsTest, DoesScanf) {
|
||||
@ -95,7 +94,8 @@ TEST_F(ScanutilsTest, DoesScanf) {
|
||||
r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]);
|
||||
r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]);
|
||||
EXPECT_EQ(r1, r2);
|
||||
for (int i = 0; i < kNumFloats; ++i) EXPECT_FLOAT_EQ(f1[i], f2[i]);
|
||||
for (int i = 0; i < kNumFloats; ++i)
|
||||
EXPECT_FLOAT_EQ(f1[i], f2[i]);
|
||||
// Test the * for field suppression.
|
||||
r1 = fscanf(fp1, "%d %*s %*d %*f %*f", &i1[0]);
|
||||
r2 = tfscanf(fp2, "%d %*s %*d %*f %*f", &i2[0]);
|
||||
@ -111,4 +111,4 @@ TEST_F(ScanutilsTest, DoesScanf) {
|
||||
fclose(fp1);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -179,4 +179,4 @@ TEST_F(ShapeTableTest, FullTest) {
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -56,4 +56,4 @@ TEST_F(STATSTest, TopNModes) {
|
||||
EXPECT_EQ(6, modes[2].data());
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
} // namespace tesseract
|
||||
|
@ -27,13 +27,11 @@ class Array2D : public std::vector<T> {
|
||||
public:
|
||||
Array2D() : std::vector<T>(std::vector<int64_t>{0, 0}) {}
|
||||
|
||||
Array2D(const int64_t n1, const int64_t n2)
|
||||
: std::vector<T>(std::vector<int64_t>{n1, n2}) {}
|
||||
Array2D(const int64_t n1, const int64_t n2) : std::vector<T>(std::vector<int64_t>{n1, n2}) {}
|
||||
|
||||
Array2D(const int64_t n1, const int64_t n2, const T value)
|
||||
: std::vector<T>({n1, n2}, value) {}
|
||||
Array2D(const int64_t n1, const int64_t n2, const T value) : std::vector<T>({n1, n2}, value) {}
|
||||
};
|
||||
}
|
||||
} // namespace xla
|
||||
#endif
|
||||
|
||||
class StridemapTest : public ::testing::Test {
|
||||
@ -81,16 +79,12 @@ TEST_F(StridemapTest, Indexing) {
|
||||
int pos = 0;
|
||||
do {
|
||||
EXPECT_GE(index.t(), pos);
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH)),
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),
|
||||
pos);
|
||||
EXPECT_EQ(index.IsLast(FD_BATCH),
|
||||
index.index(FD_BATCH) == arrays.size() - 1);
|
||||
EXPECT_EQ(
|
||||
index.IsLast(FD_HEIGHT),
|
||||
EXPECT_EQ(index.IsLast(FD_BATCH), index.index(FD_BATCH) == arrays.size() - 1);
|
||||
EXPECT_EQ(index.IsLast(FD_HEIGHT),
|
||||
index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1);
|
||||
EXPECT_EQ(
|
||||
index.IsLast(FD_WIDTH),
|
||||
EXPECT_EQ(index.IsLast(FD_WIDTH),
|
||||
index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1);
|
||||
EXPECT_TRUE(index.IsValid());
|
||||
++pos;
|
||||
@ -100,8 +94,7 @@ TEST_F(StridemapTest, Indexing) {
|
||||
do {
|
||||
--pos;
|
||||
EXPECT_GE(index.t(), pos);
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH)),
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),
|
||||
pos);
|
||||
StrideMap::Index copy(index);
|
||||
// Since a change in batch index changes the height and width, it isn't
|
||||
@ -151,33 +144,29 @@ TEST_F(StridemapTest, Scaling) {
|
||||
stride_map.SetStride(h_w_sizes);
|
||||
|
||||
// Scale x by 2, keeping y the same.
|
||||
std::vector<int> values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18,
|
||||
22, 23, 27, 28, 32, 33, 36, 37, 40, 41,
|
||||
44, 45, 48, 49, 53, 54, 58, 59};
|
||||
std::vector<int> values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18, 22, 23, 27, 28,
|
||||
32, 33, 36, 37, 40, 41, 44, 45, 48, 49, 53, 54, 58, 59};
|
||||
StrideMap test_map(stride_map);
|
||||
test_map.ScaleXY(2, 1);
|
||||
StrideMap::Index index(test_map);
|
||||
int pos = 0;
|
||||
do {
|
||||
int expected_value = values_x2[pos++];
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH)),
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),
|
||||
expected_value);
|
||||
} while (index.Increment());
|
||||
EXPECT_EQ(pos, values_x2.size());
|
||||
|
||||
test_map = stride_map;
|
||||
// Scale y by 2, keeping x the same.
|
||||
std::vector<int> values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16,
|
||||
17, 18, 19, 20, 21, 32, 33, 34, 35,
|
||||
36, 37, 38, 39, 48, 49, 50, 51, 52};
|
||||
std::vector<int> values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52};
|
||||
test_map.ScaleXY(1, 2);
|
||||
index.InitToFirst();
|
||||
pos = 0;
|
||||
do {
|
||||
int expected_value = values_y2[pos++];
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH)),
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),
|
||||
expected_value);
|
||||
} while (index.Increment());
|
||||
EXPECT_EQ(pos, values_y2.size());
|
||||
@ -190,23 +179,20 @@ TEST_F(StridemapTest, Scaling) {
|
||||
pos = 0;
|
||||
do {
|
||||
int expected_value = values_xy2[pos++];
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH)),
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),
|
||||
expected_value);
|
||||
} while (index.Increment());
|
||||
EXPECT_EQ(pos, values_xy2.size());
|
||||
|
||||
test_map = stride_map;
|
||||
// Reduce Width to 1.
|
||||
std::vector<int> values_x_to_1 = {0, 4, 8, 12, 17, 22, 27,
|
||||
32, 36, 40, 44, 48, 53, 58};
|
||||
std::vector<int> values_x_to_1 = {0, 4, 8, 12, 17, 22, 27, 32, 36, 40, 44, 48, 53, 58};
|
||||
test_map.ReduceWidthTo1();
|
||||
index.InitToFirst();
|
||||
pos = 0;
|
||||
do {
|
||||
int expected_value = values_x_to_1[pos++];
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
|
||||
index.index(FD_WIDTH)),
|
||||
EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), index.index(FD_WIDTH)),
|
||||
expected_value);
|
||||
} while (index.Increment());
|
||||
EXPECT_EQ(pos, values_x_to_1.size());
|
||||
@ -216,4 +202,4 @@ TEST_F(StridemapTest, Scaling) {
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -17,8 +17,8 @@
|
||||
#include "stringrenderer.h"
|
||||
#include "strngs.h"
|
||||
|
||||
#include "absl/strings/str_split.h" // for absl::StrSplit
|
||||
#include <allheaders.h>
|
||||
#include "absl/strings/str_split.h" // for absl::StrSplit
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
@ -63,7 +63,8 @@ class StringRendererTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
void DisplayClusterBoxes(Pix *pix) {
|
||||
if (!FLAGS_display) return;
|
||||
if (!FLAGS_display)
|
||||
return;
|
||||
const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();
|
||||
Boxa *boxes = boxaCreate(0);
|
||||
for (const auto &boxchar : boxchars) {
|
||||
@ -81,31 +82,27 @@ class StringRendererTest : public ::testing::Test {
|
||||
TEST_F(StringRendererTest, DoesRenderToImage) {
|
||||
renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
pixDestroy(&pix);
|
||||
|
||||
renderer_.reset(new StringRenderer("UnBatang 10", 600, 600));
|
||||
EXPECT_EQ(strlen(kKorText),
|
||||
renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));
|
||||
EXPECT_EQ(strlen(kKorText), renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
pixDestroy(&pix);
|
||||
|
||||
renderer_.reset(new StringRenderer("Lohit Hindi 10", 600, 600));
|
||||
EXPECT_EQ(strlen(kHinText),
|
||||
renderer_->RenderToImage(kHinText, strlen(kHinText), &pix));
|
||||
EXPECT_EQ(strlen(kHinText), renderer_->RenderToImage(kHinText, strlen(kHinText), &pix));
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
pixDestroy(&pix);
|
||||
|
||||
// RTL text
|
||||
renderer_.reset(new StringRenderer("Arab 10", 600, 600));
|
||||
EXPECT_EQ(strlen(kArabicText),
|
||||
renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix));
|
||||
EXPECT_EQ(strlen(kArabicText), renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
@ -113,8 +110,7 @@ TEST_F(StringRendererTest, DoesRenderToImage) {
|
||||
|
||||
// Mixed direction Arabic + english text
|
||||
renderer_.reset(new StringRenderer("Arab 10", 600, 600));
|
||||
EXPECT_EQ(strlen(kMixedText),
|
||||
renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix));
|
||||
EXPECT_EQ(strlen(kMixedText), renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
@ -127,8 +123,7 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) {
|
||||
renderer_->set_underline_start_prob(1.0);
|
||||
renderer_->set_underline_continuation_prob(0);
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
@ -138,8 +133,7 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) {
|
||||
// Underline all words AND intervening spaces.
|
||||
renderer_->set_underline_start_prob(1.0);
|
||||
renderer_->set_underline_continuation_prob(1.0);
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
@ -149,8 +143,7 @@ TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) {
|
||||
// Underline words and intervening spaces with 0.5 prob.
|
||||
renderer_->set_underline_start_prob(0.5);
|
||||
renderer_->set_underline_continuation_prob(0.5);
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
@ -162,8 +155,7 @@ TEST_F(StringRendererTest, DoesHandleNewlineCharacters) {
|
||||
const char kStrippedText[] = " A B C "; // text with newline chars removed
|
||||
renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kRawText),
|
||||
renderer_->RenderToImage(kRawText, strlen(kRawText), &pix));
|
||||
EXPECT_EQ(strlen(kRawText), renderer_->RenderToImage(kRawText, strlen(kRawText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();
|
||||
// 3 characters + 4 spaces => 7 boxes
|
||||
@ -183,8 +175,7 @@ TEST_F(StringRendererTest, DoesRenderLigatures) {
|
||||
const char kArabicLigature[] = "لا";
|
||||
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(
|
||||
strlen(kArabicLigature),
|
||||
EXPECT_EQ(strlen(kArabicLigature),
|
||||
renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
@ -202,10 +193,10 @@ TEST_F(StringRendererTest, DoesRenderLigatures) {
|
||||
pixDestroy(&pix);
|
||||
}
|
||||
|
||||
static int FindBoxCharXCoord(const std::vector<BoxChar*>& boxchars,
|
||||
const std::string& ch) {
|
||||
static int FindBoxCharXCoord(const std::vector<BoxChar *> &boxchars, const std::string &ch) {
|
||||
for (const auto &boxchar : boxchars) {
|
||||
if (boxchar->ch() == ch) return boxchar->box()->x;
|
||||
if (boxchar->ch() == ch)
|
||||
return boxchar->box()->x;
|
||||
}
|
||||
return INT_MAX;
|
||||
}
|
||||
@ -221,8 +212,7 @@ TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) {
|
||||
// Decode to get the box text strings.
|
||||
EXPECT_FALSE(boxes_str.empty());
|
||||
std::vector<STRING> texts;
|
||||
EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts,
|
||||
nullptr, nullptr));
|
||||
EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts, nullptr, nullptr));
|
||||
std::string ltr_str;
|
||||
for (size_t i = 0; i < texts.size(); ++i) {
|
||||
ltr_str += texts[i].c_str();
|
||||
@ -245,8 +235,7 @@ TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) {
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();
|
||||
for (size_t i = 1; i < boxchars.size(); ++i) {
|
||||
EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x)
|
||||
<< boxchars[i - 1]->ch();
|
||||
EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) << boxchars[i - 1]->ch();
|
||||
}
|
||||
pixDestroy(&pix);
|
||||
|
||||
@ -256,8 +245,7 @@ TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) {
|
||||
renderer_->RenderToImage(kEnglishWord, strlen(kEnglishWord), &pix);
|
||||
EXPECT_EQ(boxchars.size(), strlen(kEnglishWord));
|
||||
for (size_t i = 1; i < boxchars.size(); ++i) {
|
||||
EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x)
|
||||
<< boxchars[i - 1]->ch();
|
||||
EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) << boxchars[i - 1]->ch();
|
||||
}
|
||||
pixDestroy(&pix);
|
||||
|
||||
@ -274,8 +262,7 @@ TEST_F(StringRendererTest, DoesRenderVerticalText) {
|
||||
Pix *pix = nullptr;
|
||||
renderer_.reset(new StringRenderer("UnBatang 10", 600, 600));
|
||||
renderer_->set_vertical_text(true);
|
||||
EXPECT_EQ(strlen(kKorText),
|
||||
renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));
|
||||
EXPECT_EQ(strlen(kKorText), renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
DisplayClusterBoxes(pix);
|
||||
pixDestroy(&pix);
|
||||
@ -289,8 +276,7 @@ TEST_F(StringRendererTest, DoesKeepAllImageBoxes) {
|
||||
int num_boxes_per_page = 0;
|
||||
const int kNumTrials = 2;
|
||||
for (int i = 0; i < kNumTrials; ++i) {
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
pixDestroy(&pix);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
@ -299,8 +285,7 @@ TEST_F(StringRendererTest, DoesKeepAllImageBoxes) {
|
||||
} else {
|
||||
EXPECT_EQ((i + 1) * num_boxes_per_page, renderer_->GetBoxes().size());
|
||||
}
|
||||
for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page;
|
||||
++j) {
|
||||
for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page; ++j) {
|
||||
EXPECT_EQ(i, renderer_->GetBoxes()[j]->page());
|
||||
}
|
||||
}
|
||||
@ -309,15 +294,13 @@ TEST_F(StringRendererTest, DoesKeepAllImageBoxes) {
|
||||
TEST_F(StringRendererTest, DoesClearBoxes) {
|
||||
renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
pixDestroy(&pix);
|
||||
EXPECT_GT(renderer_->GetBoxes().size(), 0);
|
||||
const int num_boxes_per_page = renderer_->GetBoxes().size();
|
||||
|
||||
renderer_->ClearBoxes();
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
pixDestroy(&pix);
|
||||
EXPECT_EQ(num_boxes_per_page, renderer_->GetBoxes().size());
|
||||
}
|
||||
@ -327,8 +310,7 @@ TEST_F(StringRendererTest, DoesLigatureTextForRendering) {
|
||||
renderer_->set_add_ligatures(true);
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kEngNonLigatureText),
|
||||
renderer_->RenderToImage(kEngNonLigatureText,
|
||||
strlen(kEngNonLigatureText), &pix));
|
||||
renderer_->RenderToImage(kEngNonLigatureText, strlen(kEngNonLigatureText), &pix));
|
||||
pixDestroy(&pix);
|
||||
// There should be one less box than letters due to the 'fi' ligature.
|
||||
EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size());
|
||||
@ -340,8 +322,7 @@ TEST_F(StringRendererTest, DoesRetainInputLigatureForRendering) {
|
||||
renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kEngLigatureText),
|
||||
renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText),
|
||||
&pix));
|
||||
renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText), &pix));
|
||||
pixDestroy(&pix);
|
||||
// There should be one less box than letters due to the 'fi' ligature.
|
||||
EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size());
|
||||
@ -362,12 +343,10 @@ TEST_F(StringRendererTest, DoesRenderWordBoxes) {
|
||||
renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
|
||||
renderer_->set_output_word_boxes(true);
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_EQ(strlen(kEngText),
|
||||
renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kEngText), renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
|
||||
pixDestroy(&pix);
|
||||
// Verify #boxchars = #words + #spaces
|
||||
std::vector<std::string> words =
|
||||
absl::StrSplit(kEngText, ' ', absl::SkipEmpty());
|
||||
std::vector<std::string> words = absl::StrSplit(kEngText, ' ', absl::SkipEmpty());
|
||||
const int kNumSpaces = words.size() - 1;
|
||||
const int kExpectedNumBoxes = words.size() + kNumSpaces;
|
||||
const std::vector<BoxChar *> &boxchars = renderer_->GetBoxes();
|
||||
@ -387,8 +366,7 @@ TEST_F(StringRendererTest, DoesRenderWordBoxesFromMultiLineText) {
|
||||
renderer_->set_output_word_boxes(true);
|
||||
Pix *pix = nullptr;
|
||||
const char kMultlineText[] = "the quick brown fox\njumps over the lazy dog";
|
||||
EXPECT_EQ(strlen(kMultlineText),
|
||||
renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix));
|
||||
EXPECT_EQ(strlen(kMultlineText), renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix));
|
||||
pixDestroy(&pix);
|
||||
// Verify #boxchars = #words + #spaces + #newlines
|
||||
std::vector<std::string> words =
|
||||
@ -414,13 +392,14 @@ TEST_F(StringRendererTest, DoesRenderAllFontsToImage) {
|
||||
do {
|
||||
Pix *pix = nullptr;
|
||||
font_used.clear();
|
||||
offset += renderer_->RenderAllFontsToImage(
|
||||
1.0, kEngText + offset, strlen(kEngText + offset), &font_used, &pix);
|
||||
offset += renderer_->RenderAllFontsToImage(1.0, kEngText + offset, strlen(kEngText + offset),
|
||||
&font_used, &pix);
|
||||
if (offset < strlen(kEngText)) {
|
||||
EXPECT_TRUE(pix != nullptr);
|
||||
EXPECT_STRNE("", font_used.c_str());
|
||||
}
|
||||
if (FLAGS_display) pixDisplay(pix, 0, 0);
|
||||
if (FLAGS_display)
|
||||
pixDisplay(pix, 0, 0);
|
||||
pixDestroy(&pix);
|
||||
} while (offset < strlen(kEngText));
|
||||
}
|
||||
@ -447,8 +426,7 @@ TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) {
|
||||
const std::string kWord = "office";
|
||||
const std::string kCleanWord = "oice";
|
||||
Pix *pix = nullptr;
|
||||
EXPECT_FALSE(
|
||||
renderer_->font().CanRenderString(kWord.c_str(), kWord.length()));
|
||||
EXPECT_FALSE(renderer_->font().CanRenderString(kWord.c_str(), kWord.length()));
|
||||
EXPECT_FALSE(renderer_->font().CoversUTF8Text(kWord.c_str(), kWord.length()));
|
||||
int offset = renderer_->RenderToImage(kWord.c_str(), kWord.length(), &pix);
|
||||
pixDestroy(&pix);
|
||||
@ -465,50 +443,40 @@ TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) {
|
||||
TEST(ConvertBasicLatinToFullwidthLatinTest, DoesConvertBasicLatin) {
|
||||
const std::string kHalfAlpha = "ABCD";
|
||||
const std::string kFullAlpha = "ABCD";
|
||||
EXPECT_EQ(kFullAlpha,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha));
|
||||
EXPECT_EQ(kFullAlpha, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha));
|
||||
|
||||
const std::string kHalfDigit = "0123";
|
||||
const std::string kFullDigit = "0123";
|
||||
EXPECT_EQ(kFullDigit,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit));
|
||||
EXPECT_EQ(kFullDigit, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit));
|
||||
|
||||
const std::string kHalfSym = "()[]:;!?";
|
||||
const std::string kFullSym = "()[]:;!?";
|
||||
EXPECT_EQ(kFullSym,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym));
|
||||
EXPECT_EQ(kFullSym, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym));
|
||||
}
|
||||
|
||||
TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertFullwidthLatin) {
|
||||
const std::string kFullAlpha = "ABCD";
|
||||
EXPECT_EQ(kFullAlpha,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha));
|
||||
EXPECT_EQ(kFullAlpha, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha));
|
||||
|
||||
const std::string kFullDigit = "0123";
|
||||
EXPECT_EQ(kFullDigit,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit));
|
||||
EXPECT_EQ(kFullDigit, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit));
|
||||
|
||||
const std::string kFullSym = "()[]:;!?";
|
||||
EXPECT_EQ(kFullSym,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym));
|
||||
EXPECT_EQ(kFullSym, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym));
|
||||
}
|
||||
|
||||
TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertNonLatin) {
|
||||
const std::string kHalfKana = "アイウエオ";
|
||||
const std::string kFullKana = "アイウエオ";
|
||||
EXPECT_EQ(kHalfKana,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana));
|
||||
EXPECT_EQ(kFullKana,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana));
|
||||
EXPECT_EQ(kHalfKana, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana));
|
||||
EXPECT_EQ(kFullKana, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana));
|
||||
}
|
||||
|
||||
TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) {
|
||||
const std::string kHalfSpace = " ";
|
||||
const std::string kFullSpace = " ";
|
||||
EXPECT_EQ(kHalfSpace,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace));
|
||||
EXPECT_EQ(kFullSpace,
|
||||
StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace));
|
||||
EXPECT_EQ(kHalfSpace, StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace));
|
||||
EXPECT_EQ(kFullSpace, StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace));
|
||||
}
|
||||
|
||||
// ------------ StringRenderer::ConvertFullwidthLatinToBasicLatin() ------------
|
||||
@ -516,49 +484,39 @@ TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) {
|
||||
TEST(ConvertFullwidthLatinToBasicLatinTest, DoesConvertFullwidthLatin) {
|
||||
const std::string kHalfAlpha = "ABCD";
|
||||
const std::string kFullAlpha = "ABCD";
|
||||
EXPECT_EQ(kHalfAlpha,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha));
|
||||
EXPECT_EQ(kHalfAlpha, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha));
|
||||
|
||||
const std::string kHalfDigit = "0123";
|
||||
const std::string kFullDigit = "0123";
|
||||
EXPECT_EQ(kHalfDigit,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit));
|
||||
EXPECT_EQ(kHalfDigit, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit));
|
||||
|
||||
const std::string kHalfSym = "()[]:;!?";
|
||||
const std::string kFullSym = "()[]:;!?";
|
||||
EXPECT_EQ(kHalfSym,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym));
|
||||
EXPECT_EQ(kHalfSym, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym));
|
||||
}
|
||||
|
||||
TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertBasicLatin) {
|
||||
const std::string kHalfAlpha = "ABCD";
|
||||
EXPECT_EQ(kHalfAlpha,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha));
|
||||
EXPECT_EQ(kHalfAlpha, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha));
|
||||
|
||||
const std::string kHalfDigit = "0123";
|
||||
EXPECT_EQ(kHalfDigit,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit));
|
||||
EXPECT_EQ(kHalfDigit, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit));
|
||||
|
||||
const std::string kHalfSym = "()[]:;!?";
|
||||
EXPECT_EQ(kHalfSym,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym));
|
||||
EXPECT_EQ(kHalfSym, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym));
|
||||
}
|
||||
|
||||
TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertNonLatin) {
|
||||
const std::string kHalfKana = "アイウエオ";
|
||||
const std::string kFullKana = "アイウエオ";
|
||||
EXPECT_EQ(kHalfKana,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana));
|
||||
EXPECT_EQ(kFullKana,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana));
|
||||
EXPECT_EQ(kHalfKana, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana));
|
||||
EXPECT_EQ(kFullKana, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana));
|
||||
}
|
||||
|
||||
TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertSpace) {
|
||||
const std::string kHalfSpace = " ";
|
||||
const std::string kFullSpace = " ";
|
||||
EXPECT_EQ(kHalfSpace,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace));
|
||||
EXPECT_EQ(kFullSpace,
|
||||
StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace));
|
||||
EXPECT_EQ(kHalfSpace, StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace));
|
||||
EXPECT_EQ(kFullSpace, StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace));
|
||||
}
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -24,7 +24,6 @@ limitations under the License.
|
||||
|
||||
#include "google/protobuf/util/message_differencer.h"
|
||||
|
||||
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/lib/strings/strcat.h"
|
||||
#include "tensorflow/core/lib/strings/stringprintf.h"
|
||||
@ -32,29 +31,26 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/protobuf.h"
|
||||
|
||||
|
||||
|
||||
using tensorflow::int8;
|
||||
using std::map;
|
||||
using std::pair;
|
||||
using std::unordered_map;
|
||||
using std::unordered_set;
|
||||
using std::vector;
|
||||
using tensorflow::int16;
|
||||
using tensorflow::int32;
|
||||
using tensorflow::int64;
|
||||
using tensorflow::uint8;
|
||||
using tensorflow::uint16;
|
||||
using tensorflow::uint64;
|
||||
using tensorflow::uint32;
|
||||
using tensorflow::protobuf::TextFormat;
|
||||
using tensorflow::mutex_lock;
|
||||
using tensorflow::int8;
|
||||
using tensorflow::mutex;
|
||||
using std::map;
|
||||
using std::pair;
|
||||
using std::vector;
|
||||
using std::unordered_map;
|
||||
using std::unordered_set;
|
||||
using tensorflow::mutex_lock;
|
||||
using tensorflow::uint16;
|
||||
using tensorflow::uint32;
|
||||
using tensorflow::uint64;
|
||||
using tensorflow::uint8;
|
||||
using tensorflow::protobuf::TextFormat;
|
||||
typedef signed int char32;
|
||||
|
||||
using tensorflow::StringPiece;
|
||||
using std::string;
|
||||
|
||||
using tensorflow::StringPiece;
|
||||
|
||||
// namespace syntaxnet
|
||||
|
||||
|
@ -39,8 +39,7 @@ class TestableTableFinder : public tesseract::TableFinder {
|
||||
while ((part = gsearch.NextFullSearch()) != nullptr) {
|
||||
if (part->bounding_box().left() == box.left() &&
|
||||
part->bounding_box().bottom() == box.bottom() &&
|
||||
part->bounding_box().right() == box.right() &&
|
||||
part->bounding_box().top() == box.top()) {
|
||||
part->bounding_box().right() == box.right() && part->bounding_box().top() == box.top()) {
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
@ -72,7 +71,8 @@ class TableFinderTest : public testing::Test {
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
if (partition_.get() != nullptr) partition_->DeleteBoxes();
|
||||
if (partition_.get() != nullptr)
|
||||
partition_->DeleteBoxes();
|
||||
DeletePartitionListBoxes();
|
||||
finder_.reset(nullptr);
|
||||
}
|
||||
@ -81,13 +81,13 @@ class TableFinderTest : public testing::Test {
|
||||
MakePartition(x_min, y_min, x_max, y_max, 0, 0);
|
||||
}
|
||||
|
||||
void MakePartition(int x_min, int y_min, int x_max, int y_max,
|
||||
int first_column, int last_column) {
|
||||
if (partition_.get() != nullptr) partition_->DeleteBoxes();
|
||||
void MakePartition(int x_min, int y_min, int x_max, int y_max, int first_column,
|
||||
int last_column) {
|
||||
if (partition_.get() != nullptr)
|
||||
partition_->DeleteBoxes();
|
||||
TBOX box;
|
||||
box.set_to_given_coords(x_min, y_min, x_max, y_max);
|
||||
partition_.reset(
|
||||
ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE));
|
||||
partition_.reset(ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE));
|
||||
partition_->set_first_column(first_column);
|
||||
partition_->set_last_column(last_column);
|
||||
}
|
||||
@ -101,12 +101,12 @@ class TableFinderTest : public testing::Test {
|
||||
InsertLeaderPartition(x_min, y_min, x_max, y_max, 0, 0);
|
||||
}
|
||||
|
||||
void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max,
|
||||
int first_column, int last_column) {
|
||||
void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max, int first_column,
|
||||
int last_column) {
|
||||
TBOX box;
|
||||
box.set_to_given_coords(x_min, y_min, x_max, y_max);
|
||||
ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT,
|
||||
BRT_UNKNOWN, BTFT_LEADER);
|
||||
ColPartition *part =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_UNKNOWN, BTFT_LEADER);
|
||||
part->set_first_column(first_column);
|
||||
part->set_last_column(last_column);
|
||||
finder_->InsertLeaderPartition(part);
|
||||
@ -114,8 +114,7 @@ class TableFinderTest : public testing::Test {
|
||||
}
|
||||
|
||||
void DeletePartitionListBoxes() {
|
||||
for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list();
|
||||
free_boxes_it_.forward()) {
|
||||
for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list(); free_boxes_it_.forward()) {
|
||||
ColPartition *part = free_boxes_it_.data();
|
||||
part->DeleteBoxes();
|
||||
}
|
||||
@ -131,23 +130,30 @@ class TableFinderTest : public testing::Test {
|
||||
|
||||
TEST_F(TableFinderTest, GapInXProjectionNoGap) {
|
||||
int data[100];
|
||||
for (int i = 0; i < 100; ++i) data[i] = 10;
|
||||
for (int i = 0; i < 100; ++i)
|
||||
data[i] = 10;
|
||||
EXPECT_FALSE(finder_->GapInXProjection(data, 100));
|
||||
}
|
||||
|
||||
TEST_F(TableFinderTest, GapInXProjectionEdgeGap) {
|
||||
int data[100];
|
||||
for (int i = 0; i < 10; ++i) data[i] = 2;
|
||||
for (int i = 10; i < 90; ++i) data[i] = 10;
|
||||
for (int i = 90; i < 100; ++i) data[i] = 2;
|
||||
for (int i = 0; i < 10; ++i)
|
||||
data[i] = 2;
|
||||
for (int i = 10; i < 90; ++i)
|
||||
data[i] = 10;
|
||||
for (int i = 90; i < 100; ++i)
|
||||
data[i] = 2;
|
||||
EXPECT_FALSE(finder_->GapInXProjection(data, 100));
|
||||
}
|
||||
|
||||
TEST_F(TableFinderTest, GapInXProjectionExists) {
|
||||
int data[100];
|
||||
for (int i = 0; i < 10; ++i) data[i] = 10;
|
||||
for (int i = 10; i < 90; ++i) data[i] = 2;
|
||||
for (int i = 90; i < 100; ++i) data[i] = 10;
|
||||
for (int i = 0; i < 10; ++i)
|
||||
data[i] = 10;
|
||||
for (int i = 10; i < 90; ++i)
|
||||
data[i] = 2;
|
||||
for (int i = 90; i < 100; ++i)
|
||||
data[i] = 10;
|
||||
EXPECT_TRUE(finder_->GapInXProjection(data, 100));
|
||||
}
|
||||
|
||||
@ -258,4 +264,4 @@ TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) {
|
||||
finder_->ExpectPartitionCount(1);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -89,8 +89,7 @@ class SharedTest : public testing::Test {
|
||||
|
||||
void InsertPartition(int left, int bottom, int right, int top) {
|
||||
TBOX box(left, bottom, right, top);
|
||||
ColPartition* part =
|
||||
ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part->set_median_width(3);
|
||||
part->set_median_height(3);
|
||||
text_grid_->InsertBBox(true, true, part);
|
||||
@ -100,30 +99,28 @@ class SharedTest : public testing::Test {
|
||||
}
|
||||
|
||||
void InsertLines() {
|
||||
line_box_.set_to_given_coords(
|
||||
100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(),
|
||||
line_box_.set_to_given_coords(100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(),
|
||||
450 + line_grid_->gridsize(), 50 + line_grid_->gridsize());
|
||||
for (int i = 10; i <= 50; i += 10) InsertHorizontalLine(100, 450, i);
|
||||
for (int i = 100; i <= 450; i += 50) InsertVerticalLine(i, 10, 50);
|
||||
for (int i = 10; i <= 50; i += 10)
|
||||
InsertHorizontalLine(100, 450, i);
|
||||
for (int i = 100; i <= 450; i += 50)
|
||||
InsertVerticalLine(i, 10, 50);
|
||||
|
||||
for (int i = 100; i <= 200; i += 20) InsertHorizontalLine(0, 100, i);
|
||||
for (int i = 100; i <= 200; i += 20)
|
||||
InsertHorizontalLine(0, 100, i);
|
||||
}
|
||||
|
||||
void InsertHorizontalLine(int left, int right, int y) {
|
||||
TBOX box(left, y - line_grid_->gridsize(), right,
|
||||
y + line_grid_->gridsize());
|
||||
ColPartition* part =
|
||||
ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE);
|
||||
TBOX box(left, y - line_grid_->gridsize(), right, y + line_grid_->gridsize());
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE);
|
||||
line_grid_->InsertBBox(true, true, part);
|
||||
|
||||
tesseract::ColPartition_IT add_it(&allocated_parts_);
|
||||
add_it.add_after_stay_put(part);
|
||||
}
|
||||
void InsertVerticalLine(int x, int bottom, int top) {
|
||||
TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(),
|
||||
top);
|
||||
ColPartition* part =
|
||||
ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE);
|
||||
TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(), top);
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE);
|
||||
line_grid_->InsertBBox(true, true, part);
|
||||
|
||||
tesseract::ColPartition_IT add_it(&allocated_parts_);
|
||||
@ -266,8 +263,10 @@ TEST_F(StructuredTableTest, CountHorizontalIntersectionsAll) {
|
||||
}
|
||||
|
||||
TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) {
|
||||
for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y);
|
||||
for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x);
|
||||
for (int y = 10; y <= 50; y += 10)
|
||||
table_->InjectCellY(y);
|
||||
for (int x = 100; x <= 450; x += 50)
|
||||
table_->InjectCellX(x);
|
||||
InsertLines();
|
||||
InsertCellsInLines();
|
||||
table_->set_bounding_box(line_box_);
|
||||
@ -275,8 +274,10 @@ TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) {
|
||||
}
|
||||
|
||||
TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) {
|
||||
for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y);
|
||||
for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x);
|
||||
for (int y = 10; y <= 50; y += 10)
|
||||
table_->InjectCellY(y);
|
||||
for (int x = 100; x <= 450; x += 50)
|
||||
table_->InjectCellX(x);
|
||||
InsertLines();
|
||||
InsertCellsInLines();
|
||||
InsertPartition(101, 11, 299, 19);
|
||||
@ -285,8 +286,10 @@ TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) {
|
||||
}
|
||||
|
||||
TEST_F(StructuredTableTest, VerifyLinedTableVerticalFail) {
|
||||
for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y);
|
||||
for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x);
|
||||
for (int y = 10; y <= 50; y += 10)
|
||||
table_->InjectCellY(y);
|
||||
for (int x = 100; x <= 450; x += 50)
|
||||
table_->InjectCellX(x);
|
||||
InsertLines();
|
||||
InsertCellsInLines();
|
||||
InsertPartition(151, 21, 199, 39);
|
||||
@ -313,4 +316,4 @@ TEST_F(StructuredTableTest, FindWhitespacedColumnsSorted) {
|
||||
// TODO(nbeato): check failure cases
|
||||
// TODO(nbeato): check Recognize processes correctly on trivial real examples.
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -127,4 +127,4 @@ TEST_F(TabVectorTest, XYFlip) {
|
||||
EXPECT_EQ(3, vector_->endpt().y());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -15,8 +15,8 @@
|
||||
# include <unistd.h> // for access
|
||||
#endif
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "dawg.h"
|
||||
#include "include_gunit.h"
|
||||
#include "trie.h"
|
||||
#include "unicharset.h"
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
@ -53,7 +53,8 @@ class TatweelTest : public ::testing::Test {
|
||||
int num_tatweel = 0;
|
||||
for (auto it = text.begin(); it != text.end(); ++it) {
|
||||
std::string utf8 = it.get_utf8_string();
|
||||
if (utf8.find(u8"\u0640") != std::string::npos) ++num_tatweel;
|
||||
if (utf8.find(u8"\u0640") != std::string::npos)
|
||||
++num_tatweel;
|
||||
unicharset_.unichar_insert(utf8.c_str());
|
||||
}
|
||||
LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
|
||||
@ -78,15 +79,13 @@ TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
|
||||
|
||||
TEST_F(TatweelTest, DictIgnoresTatweel) {
|
||||
// This test verifies that the dictionary ignores the Tatweel character.
|
||||
tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM,
|
||||
unicharset_.size(), 0);
|
||||
tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0);
|
||||
std::string filename = TestDataNameToPath("ara.wordlist");
|
||||
if (!file_exists(filename.c_str())) {
|
||||
LOG(INFO) << "Skip test because of missing " << filename;
|
||||
GTEST_SKIP();
|
||||
} else {
|
||||
EXPECT_TRUE(trie.read_and_add_word_list(
|
||||
filename.c_str(), unicharset_,
|
||||
EXPECT_TRUE(trie.read_and_add_word_list(filename.c_str(), unicharset_,
|
||||
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));
|
||||
EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
|
||||
}
|
||||
@ -104,11 +103,12 @@ TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
|
||||
int num_tatweel = 0;
|
||||
for (int i = 0; i < unicharset_.size(); ++i) {
|
||||
const char *utf8 = unicharset_.id_to_unichar(i);
|
||||
if (strstr(utf8, reinterpret_cast<const char*>(u8"\u0640")) != nullptr) ++num_tatweel;
|
||||
if (strstr(utf8, reinterpret_cast<const char *>(u8"\u0640")) != nullptr)
|
||||
++num_tatweel;
|
||||
}
|
||||
LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
|
||||
EXPECT_EQ(num_tatweel, 4);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -16,10 +16,10 @@
|
||||
#include "include_gunit.h"
|
||||
|
||||
#include <tesseract/baseapi.h>
|
||||
#include <tesseract/osdetect.h>
|
||||
#include "colfind.h"
|
||||
#include "log.h" // for LOG
|
||||
#include "mutableiterator.h"
|
||||
#include <tesseract/osdetect.h>
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "textlineprojection.h"
|
||||
@ -72,14 +72,12 @@ class TextlineProjectionTest : public testing::Test {
|
||||
tesseract::TessdataManager mgr;
|
||||
Tesseract *osd_tess = new Tesseract;
|
||||
OSResults osr;
|
||||
EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, "", "osd",
|
||||
tesseract::OEM_TESSERACT_ONLY, nullptr, 0,
|
||||
nullptr, nullptr, false, &mgr),
|
||||
EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, "", "osd", tesseract::OEM_TESSERACT_ONLY,
|
||||
nullptr, 0, nullptr, nullptr, false, &mgr),
|
||||
0);
|
||||
tesseract_ = new Tesseract;
|
||||
EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, "", "eng",
|
||||
tesseract::OEM_TESSERACT_ONLY, nullptr, 0,
|
||||
nullptr, nullptr, false, &mgr),
|
||||
EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, "", "eng", tesseract::OEM_TESSERACT_ONLY,
|
||||
nullptr, 0, nullptr, nullptr, false, &mgr),
|
||||
0);
|
||||
bin_pix_ = api_.GetThresholdedImage();
|
||||
*tesseract_->mutable_pix_binary() = pixClone(bin_pix_);
|
||||
@ -97,17 +95,16 @@ class TextlineProjectionTest : public testing::Test {
|
||||
// The blocks made by the ColumnFinder. Moved to blocks before return.
|
||||
BLOCK_LIST found_blocks;
|
||||
TO_BLOCK_LIST temp_blocks;
|
||||
finder_ = tesseract_->SetupPageSegAndDetectOrientation(
|
||||
tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess, &osr, &temp_blocks,
|
||||
&photomask_pix, nullptr);
|
||||
finder_ =
|
||||
tesseract_->SetupPageSegAndDetectOrientation(tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess,
|
||||
&osr, &temp_blocks, &photomask_pix, nullptr);
|
||||
TO_BLOCK_IT to_block_it(&temp_blocks);
|
||||
TO_BLOCK *to_block = to_block_it.data();
|
||||
denorm_ = finder_->denorm();
|
||||
TO_BLOCK_LIST to_blocks;
|
||||
BLOBNBOX_LIST diacritic_blobs;
|
||||
EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block,
|
||||
photomask_pix, nullptr, nullptr, nullptr,
|
||||
&found_blocks, &diacritic_blobs, &to_blocks),
|
||||
EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block, photomask_pix, nullptr,
|
||||
nullptr, nullptr, &found_blocks, &diacritic_blobs, &to_blocks),
|
||||
0);
|
||||
projection_ = finder_->projection();
|
||||
pixDestroy(&photomask_pix);
|
||||
@ -116,19 +113,17 @@ class TextlineProjectionTest : public testing::Test {
|
||||
|
||||
// Helper evaluates the given box, expects the result to be greater_than
|
||||
// or !greater_than the target_value and provides diagnostics if not.
|
||||
void EvaluateBox(const TBOX& box, bool greater_or_equal, int target_value,
|
||||
const char* text, const char* message) {
|
||||
void EvaluateBox(const TBOX &box, bool greater_or_equal, int target_value, const char *text,
|
||||
const char *message) {
|
||||
int value = projection_->EvaluateBox(box, denorm_, false);
|
||||
if (greater_or_equal != (value > target_value)) {
|
||||
LOG(INFO) << absl::StrFormat(
|
||||
"EvaluateBox too %s:%d vs %d for %s word '%s' at:",
|
||||
greater_or_equal ? "low" : "high", value, target_value, message,
|
||||
text);
|
||||
"EvaluateBox too %s:%d vs %d for %s word '%s' at:", greater_or_equal ? "low" : "high",
|
||||
value, target_value, message, text);
|
||||
box.print();
|
||||
value = projection_->EvaluateBox(box, denorm_, true);
|
||||
} else {
|
||||
LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'",
|
||||
value, message, text);
|
||||
LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'", value, message, text);
|
||||
}
|
||||
if (greater_or_equal) {
|
||||
EXPECT_GE(value, target_value);
|
||||
@ -139,23 +134,19 @@ class TextlineProjectionTest : public testing::Test {
|
||||
|
||||
// Helper evaluates the DistanceOfBoxFromBox function by expecting that
|
||||
// box should be nearer to true_box than false_box.
|
||||
void EvaluateDistance(const TBOX& box, const TBOX& true_box,
|
||||
const TBOX& false_box, const char* text,
|
||||
const char* message) {
|
||||
int true_dist =
|
||||
projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false);
|
||||
int false_dist =
|
||||
projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false);
|
||||
void EvaluateDistance(const TBOX &box, const TBOX &true_box, const TBOX &false_box,
|
||||
const char *text, const char *message) {
|
||||
int true_dist = projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false);
|
||||
int false_dist = projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false);
|
||||
if (false_dist <= true_dist) {
|
||||
LOG(INFO) << absl::StrFormat(
|
||||
"Distance wrong:%d vs %d for %s word '%s' at:",
|
||||
false_dist, true_dist, message, text);
|
||||
LOG(INFO) << absl::StrFormat("Distance wrong:%d vs %d for %s word '%s' at:", false_dist,
|
||||
true_dist, message, text);
|
||||
true_box.print();
|
||||
projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, true);
|
||||
projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, true);
|
||||
} else {
|
||||
LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'",
|
||||
false_dist, true_dist, message, text);
|
||||
LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'", false_dist, true_dist,
|
||||
message, text);
|
||||
}
|
||||
}
|
||||
|
||||
@ -196,7 +187,8 @@ class TextlineProjectionTest : public testing::Test {
|
||||
TBOX lower_box = word_box;
|
||||
lower_box.set_top(word_box.bottom());
|
||||
lower_box.set_bottom(word_box.bottom() - padding);
|
||||
if (tall_word) lower_box.move(ICOORD(0, padding / 2));
|
||||
if (tall_word)
|
||||
lower_box.move(ICOORD(0, padding / 2));
|
||||
EvaluateBox(lower_box, false, kMinStrongTextValue, text, "Lower Word");
|
||||
EvaluateBox(lower_box, true, -1, text, "Lower Word not vertical");
|
||||
|
||||
@ -225,17 +217,16 @@ class TextlineProjectionTest : public testing::Test {
|
||||
TBOX upper_challenger(upper_box);
|
||||
upper_challenger.set_bottom(upper_box.top());
|
||||
upper_challenger.set_top(upper_box.top() + word_box.height());
|
||||
EvaluateDistance(upper_box, target_box, upper_challenger, text,
|
||||
"Upper Word");
|
||||
if (tall_word) lower_box.move(ICOORD(0, padding / 2));
|
||||
EvaluateDistance(upper_box, target_box, upper_challenger, text, "Upper Word");
|
||||
if (tall_word)
|
||||
lower_box.move(ICOORD(0, padding / 2));
|
||||
lower_box.set_bottom(lower_box.top() - padding);
|
||||
target_box = word_box;
|
||||
target_box.set_bottom(lower_box.top());
|
||||
TBOX lower_challenger(lower_box);
|
||||
lower_challenger.set_top(lower_box.bottom());
|
||||
lower_challenger.set_bottom(lower_box.bottom() - word_box.height());
|
||||
EvaluateDistance(lower_box, target_box, lower_challenger, text,
|
||||
"Lower Word");
|
||||
EvaluateDistance(lower_box, target_box, lower_challenger, text, "Lower Word");
|
||||
|
||||
delete[] text;
|
||||
} while (it->Next(tesseract::RIL_WORD));
|
||||
@ -254,9 +245,13 @@ class TextlineProjectionTest : public testing::Test {
|
||||
};
|
||||
|
||||
// Tests all word boxes on an unrotated image.
|
||||
TEST_F(TextlineProjectionTest, Unrotated) { VerifyBoxes("phototest.tif", 31); }
|
||||
TEST_F(TextlineProjectionTest, Unrotated) {
|
||||
VerifyBoxes("phototest.tif", 31);
|
||||
}
|
||||
|
||||
// Tests character-level applyboxes on italic Times New Roman.
|
||||
TEST_F(TextlineProjectionTest, Rotated) { VerifyBoxes("phototestrot.tif", 31); }
|
||||
TEST_F(TextlineProjectionTest, Rotated) {
|
||||
VerifyBoxes("phototestrot.tif", 31);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -33,9 +33,11 @@ class TfileTest : public ::testing::Test {
|
||||
MathData() : num_squares_(0), num_triangles_(0) {}
|
||||
void Setup() {
|
||||
// Setup some data.
|
||||
for (int s = 0; s < 42; ++s) squares_.push_back(s * s);
|
||||
for (int s = 0; s < 42; ++s)
|
||||
squares_.push_back(s * s);
|
||||
num_squares_ = squares_.size();
|
||||
for (int t = 0; t < 52; ++t) triangles_.push_back(t * (t + 1) / 2);
|
||||
for (int t = 0; t < 52; ++t)
|
||||
triangles_.push_back(t * (t + 1) / 2);
|
||||
num_triangles_ = triangles_.size();
|
||||
}
|
||||
void ExpectEq(const MathData &other) {
|
||||
@ -48,31 +50,39 @@ class TfileTest : public ::testing::Test {
|
||||
EXPECT_EQ(triangles_[s], other.triangles_[s]);
|
||||
}
|
||||
bool Serialize(TFile *fp) {
|
||||
if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false;
|
||||
if (!squares_.Serialize(fp)) return false;
|
||||
if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1)
|
||||
return false;
|
||||
if (!squares_.Serialize(fp))
|
||||
return false;
|
||||
if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1)
|
||||
return false;
|
||||
if (!triangles_.Serialize(fp)) return false;
|
||||
if (!triangles_.Serialize(fp))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
bool DeSerialize(TFile *fp) {
|
||||
if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1)
|
||||
return false;
|
||||
if (!squares_.DeSerialize(fp)) return false;
|
||||
if (!squares_.DeSerialize(fp))
|
||||
return false;
|
||||
if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1)
|
||||
return false;
|
||||
if (!triangles_.DeSerialize(fp)) return false;
|
||||
if (!triangles_.DeSerialize(fp))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
bool SerializeBigEndian(TFile *fp) {
|
||||
ReverseN(&num_squares_, sizeof(num_squares_));
|
||||
if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false;
|
||||
if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1)
|
||||
return false;
|
||||
// Write an additional reversed size before the vector, which will get
|
||||
// used as its size on reading.
|
||||
if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false;
|
||||
if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1)
|
||||
return false;
|
||||
for (int i = 0; i < squares_.size(); ++i)
|
||||
ReverseN(&squares_[i], sizeof(squares_[i]));
|
||||
if (!squares_.Serialize(fp)) return false;
|
||||
if (!squares_.Serialize(fp))
|
||||
return false;
|
||||
ReverseN(&num_triangles_, sizeof(num_triangles_));
|
||||
if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1)
|
||||
return false;
|
||||
@ -85,7 +95,8 @@ class TfileTest : public ::testing::Test {
|
||||
bool DeSerializeBigEndian(TFile *fp) {
|
||||
if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1)
|
||||
return false;
|
||||
if (!squares_.DeSerialize(fp)) return false;
|
||||
if (!squares_.DeSerialize(fp))
|
||||
return false;
|
||||
// The first element is the size that was written, so we will delete it
|
||||
// and read the last element separately.
|
||||
int last_element;
|
||||
@ -95,7 +106,8 @@ class TfileTest : public ::testing::Test {
|
||||
squares_.push_back(last_element);
|
||||
if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1)
|
||||
return false;
|
||||
if (!triangles_.DeSerialize(fp)) return false;
|
||||
if (!triangles_.DeSerialize(fp))
|
||||
return false;
|
||||
if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1)
|
||||
return false;
|
||||
triangles_.remove(0);
|
||||
@ -176,4 +188,4 @@ TEST_F(TfileTest, BigEndian) {
|
||||
m3.ExpectEq(m2);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
32
unittest/third_party/utf/rune.c
vendored
32
unittest/third_party/utf/rune.c
vendored
@ -16,8 +16,7 @@
|
||||
#include "third_party/utf/utf.h"
|
||||
#include "third_party/utf/utfdef.h"
|
||||
|
||||
enum
|
||||
{
|
||||
enum {
|
||||
Bit1 = 7,
|
||||
Bitx = 6,
|
||||
Bit2 = 5,
|
||||
@ -62,9 +61,7 @@ enum
|
||||
* Note that if we have decoding problems for other
|
||||
* reasons, we return 1 instead of 0.
|
||||
*/
|
||||
int
|
||||
charntorune(Rune *rune, const char *str, int length)
|
||||
{
|
||||
int charntorune(Rune *rune, const char *str, int length) {
|
||||
int c, c1, c2, c3;
|
||||
long l;
|
||||
|
||||
@ -157,17 +154,13 @@ bad:
|
||||
badlen:
|
||||
*rune = Bad;
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This is the older "unsafe" version, which works fine on
|
||||
* null-terminated strings.
|
||||
*/
|
||||
int
|
||||
chartorune(Rune *rune, const char *str)
|
||||
{
|
||||
int chartorune(Rune *rune, const char *str) {
|
||||
int c, c1, c2, c3;
|
||||
long l;
|
||||
|
||||
@ -243,15 +236,12 @@ bad:
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
|
||||
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed) {
|
||||
*consumed = charntorune(rune, str, length);
|
||||
return *rune != Runeerror || *consumed == 3;
|
||||
}
|
||||
|
||||
int
|
||||
runetochar(char *str, const Rune *rune)
|
||||
{
|
||||
int runetochar(char *str, const Rune *rune) {
|
||||
/* Runes are signed, so convert to unsigned for range check. */
|
||||
unsigned long c;
|
||||
|
||||
@ -306,17 +296,13 @@ runetochar(char *str, const Rune *rune)
|
||||
return 4;
|
||||
}
|
||||
|
||||
int
|
||||
runelen(Rune rune)
|
||||
{
|
||||
int runelen(Rune rune) {
|
||||
char str[10];
|
||||
|
||||
return runetochar(str, &rune);
|
||||
}
|
||||
|
||||
int
|
||||
runenlen(const Rune *r, int nrune)
|
||||
{
|
||||
int runenlen(const Rune *r, int nrune) {
|
||||
int nb;
|
||||
ulong c; /* Rune is signed, so use unsigned for range check. */
|
||||
|
||||
@ -337,9 +323,7 @@ runenlen(const Rune *r, int nrune)
|
||||
return nb;
|
||||
}
|
||||
|
||||
int
|
||||
fullrune(const char *str, int n)
|
||||
{
|
||||
int fullrune(const char *str, int n) {
|
||||
if (n > 0) {
|
||||
int c = *(uchar *)str;
|
||||
if (c < Tx)
|
||||
|
22
unittest/third_party/utf/utf.h
vendored
22
unittest/third_party/utf/utf.h
vendored
@ -18,8 +18,7 @@
|
||||
|
||||
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
|
||||
|
||||
enum
|
||||
{
|
||||
enum {
|
||||
UTFmax = 4, /* maximum bytes per rune */
|
||||
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
||||
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
||||
@ -48,7 +47,6 @@ extern "C" {
|
||||
|
||||
int runetochar(char *s, const Rune *r);
|
||||
|
||||
|
||||
// chartorune copies (decodes) at most UTFmax bytes starting at s to
|
||||
// one rune, pointed to by r, and returns the number of bytes consumed.
|
||||
// If the input is not exactly in UTF format, chartorune will set *r
|
||||
@ -61,7 +59,6 @@ int runetochar(char* s, const Rune* r);
|
||||
|
||||
int chartorune(Rune *r, const char *s);
|
||||
|
||||
|
||||
// charntorune is like chartorune, except that it will access at most
|
||||
// n bytes of s. If the UTF sequence is incomplete within n bytes,
|
||||
// charntorune will set *r to Runeerror and return 0. If it is complete
|
||||
@ -82,13 +79,11 @@ int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
|
||||
|
||||
int runelen(Rune r);
|
||||
|
||||
|
||||
// runenlen returns the number of bytes required to convert the n
|
||||
// runes pointed to by r into UTF.
|
||||
|
||||
int runenlen(const Rune *r, int n);
|
||||
|
||||
|
||||
// fullrune returns 1 if the string s of length n is long enough to be
|
||||
// decoded by chartorune, and 0 otherwise. This does not guarantee
|
||||
// that the string contains a legal UTF encoding. This routine is used
|
||||
@ -106,7 +101,6 @@ int fullrune(const char* s, int n);
|
||||
|
||||
int utflen(const char *s);
|
||||
|
||||
|
||||
// utfnlen returns the number of complete runes that are represented
|
||||
// by the first n bytes of the UTF string s. If the last few bytes of
|
||||
// the string contain an incompletely coded rune, utfnlen will not
|
||||
@ -115,7 +109,6 @@ int utflen(const char* s);
|
||||
|
||||
int utfnlen(const char *s, long n);
|
||||
|
||||
|
||||
// utfrune returns a pointer to the first occurrence of rune r in the
|
||||
// UTF string s, or 0 if r does not occur in the string. The NULL
|
||||
// byte terminating a string is considered to be part of the string s.
|
||||
@ -123,7 +116,6 @@ int utfnlen(const char* s, long n);
|
||||
|
||||
const char *utfrune(const char *s, Rune r);
|
||||
|
||||
|
||||
// utfrrune returns a pointer to the last occurrence of rune r in the
|
||||
// UTF string s, or 0 if r does not occur in the string. The NULL
|
||||
// byte terminating a string is considered to be part of the string s.
|
||||
@ -131,14 +123,12 @@ const char* utfrune(const char* s, Rune r);
|
||||
|
||||
const char *utfrrune(const char *s, Rune r);
|
||||
|
||||
|
||||
// utfutf returns a pointer to the first occurrence of the UTF string
|
||||
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
|
||||
// null string, utfutf returns s1. (cf. strstr)
|
||||
|
||||
const char *utfutf(const char *s1, const char *s2);
|
||||
|
||||
|
||||
// utfecpy copies UTF sequences until a null sequence has been copied,
|
||||
// but writes no sequences beyond es1. If any sequences are copied,
|
||||
// s1 is terminated by a null sequence, and a pointer to that sequence
|
||||
@ -146,8 +136,6 @@ const char* utfutf(const char* s1, const char* s2);
|
||||
|
||||
char *utfecpy(char *s1, char *es1, const char *s2);
|
||||
|
||||
|
||||
|
||||
// These functions are rune-string analogues of the corresponding
|
||||
// functions in strcat (3).
|
||||
//
|
||||
@ -177,8 +165,6 @@ const Rune* runestrrchr(const Rune* s, Rune c);
|
||||
long runestrlen(const Rune *s);
|
||||
const Rune *runestrstr(const Rune *s1, const Rune *s2);
|
||||
|
||||
|
||||
|
||||
// The following routines test types and modify cases for Unicode
|
||||
// characters. Unicode defines some characters as letters and
|
||||
// specifies three cases: upper, lower, and title. Mappings among the
|
||||
@ -200,7 +186,6 @@ Rune toupperrune(Rune r);
|
||||
Rune tolowerrune(Rune r);
|
||||
Rune totitlerune(Rune r);
|
||||
|
||||
|
||||
// isupperrune tests for upper case characters, including Unicode
|
||||
// upper case letters and targets of the toupper mapping. islowerrune
|
||||
// and istitlerune are defined analogously.
|
||||
@ -209,32 +194,27 @@ int isupperrune(Rune r);
|
||||
int islowerrune(Rune r);
|
||||
int istitlerune(Rune r);
|
||||
|
||||
|
||||
// isalpharune tests for Unicode letters; this includes ideographs in
|
||||
// addition to alphabetic characters.
|
||||
|
||||
int isalpharune(Rune r);
|
||||
|
||||
|
||||
// isdigitrune tests for digits. Non-digit numbers, such as Roman
|
||||
// numerals, are not included.
|
||||
|
||||
int isdigitrune(Rune r);
|
||||
|
||||
|
||||
// isideographicrune tests for ideographic characters and numbers, as
|
||||
// defined by the Unicode standard.
|
||||
|
||||
int isideographicrune(Rune r);
|
||||
|
||||
|
||||
// isspacerune tests for whitespace characters, including "C" locale
|
||||
// whitespace, Unicode defined whitespace, and the "zero-width
|
||||
// non-break space" character.
|
||||
|
||||
int isspacerune(Rune r);
|
||||
|
||||
|
||||
// (The comments in this file were copied from the manpage files rune.3,
|
||||
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
|
||||
// to conform to Google style. /JRM 11/11/05)
|
||||
|
@ -9,9 +9,9 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "gmock/gmock.h" // for testing::ElementsAreArray
|
||||
#include <tesseract/unichar.h>
|
||||
#include "gmock/gmock.h" // for testing::ElementsAreArray
|
||||
#include "include_gunit.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -40,4 +40,4 @@ TEST(UnicharTest, InvalidText) {
|
||||
EXPECT_TRUE(utf8.empty());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -11,10 +11,10 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
#include <allheaders.h>
|
||||
#include "absl/strings/ascii.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_split.h"
|
||||
#include <allheaders.h>
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "log.h" // for LOG
|
||||
@ -33,24 +33,20 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
|
||||
// Loads and compresses the given unicharset.
|
||||
void LoadUnicharset(const std::string &unicharset_name) {
|
||||
std::string radical_stroke_file =
|
||||
file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
|
||||
std::string unicharset_file =
|
||||
file::JoinPath(TESTDATA_DIR, unicharset_name);
|
||||
std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
|
||||
std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
|
||||
std::string radical_data;
|
||||
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
|
||||
file::Defaults()));
|
||||
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
|
||||
CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
|
||||
STRING radical_str(radical_data.c_str());
|
||||
null_char_ =
|
||||
unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size();
|
||||
null_char_ = unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size();
|
||||
compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
|
||||
// Get the encoding of the null char.
|
||||
RecodedCharID code;
|
||||
compressed_.EncodeUnichar(null_char_, &code);
|
||||
encoded_null_char_ = code(0);
|
||||
std::string output_name = file::JoinPath(
|
||||
FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
|
||||
std::string output_name =
|
||||
file::JoinPath(FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
|
||||
STRING encoding = compressed_.GetEncodingAsString(unicharset_);
|
||||
std::string encoding_str(&encoding[0], encoding.size());
|
||||
CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
|
||||
@ -68,16 +64,14 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
}
|
||||
// Returns true if the lang is in CJK.
|
||||
bool IsCJKLang(const std::string &lang) {
|
||||
return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
|
||||
lang == "jpn";
|
||||
return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || lang == "jpn";
|
||||
}
|
||||
// Returns true if the lang is Indic.
|
||||
bool IsIndicLang(const std::string &lang) {
|
||||
return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
|
||||
lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
|
||||
lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
|
||||
lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" ||
|
||||
lang == "tel";
|
||||
return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || lang == "mar" ||
|
||||
lang == "nep" || lang == "san" || lang == "bod" || lang == "dzo" || lang == "guj" ||
|
||||
lang == "kan" || lang == "mal" || lang == "ori" || lang == "pan" || lang == "sin" ||
|
||||
lang == "tam" || lang == "tel";
|
||||
}
|
||||
|
||||
// Expects the appropriate results from the compressed_ unicharset_.
|
||||
@ -85,13 +79,14 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
// Count the number of times each code is used in each element of
|
||||
// RecodedCharID.
|
||||
RecodedCharID zeros;
|
||||
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
|
||||
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i)
|
||||
zeros.Set(i, 0);
|
||||
int code_range = compressed_.code_range();
|
||||
std::vector<RecodedCharID> times_seen(code_range, zeros);
|
||||
for (int u = 0; u <= unicharset_.size(); ++u) {
|
||||
if (u != UNICHAR_SPACE && u != null_char_ &&
|
||||
(u == unicharset_.size() || (unicharset_.has_special_codes() &&
|
||||
u < SPECIAL_UNICHAR_CODES_COUNT))) {
|
||||
(u == unicharset_.size() ||
|
||||
(unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) {
|
||||
continue; // Not used so not encoded.
|
||||
}
|
||||
RecodedCharID code;
|
||||
@ -117,7 +112,8 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
for (int c = 0; c < code_range; ++c) {
|
||||
int num_used = 0;
|
||||
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
|
||||
if (times_seen[c](i) != 0) ++num_used;
|
||||
if (times_seen[c](i) != 0)
|
||||
++num_used;
|
||||
}
|
||||
EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
|
||||
}
|
||||
@ -133,8 +129,7 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
} else {
|
||||
EXPECT_LE(code_range, unicharset_.size() + 1);
|
||||
}
|
||||
LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to "
|
||||
<< code_range;
|
||||
LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " << code_range;
|
||||
}
|
||||
// Checks for extensions of the current code that either finish a code, or
|
||||
// extend it and checks those extensions recursively.
|
||||
@ -238,8 +233,7 @@ TEST_F(UnicharcompressTest, GetEncodingAsString) {
|
||||
ExpectCorrect("trivial");
|
||||
STRING encoding = compressed_.GetEncodingAsString(unicharset_);
|
||||
std::string encoding_str(&encoding[0], encoding.length());
|
||||
std::vector<std::string> lines =
|
||||
absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
|
||||
std::vector<std::string> lines = absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
|
||||
EXPECT_EQ(5, lines.size());
|
||||
// The first line is always space.
|
||||
EXPECT_EQ("0\t ", lines[0]);
|
||||
|
@ -9,11 +9,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string>
|
||||
#include "log.h" // for LOG
|
||||
#include "unicharset.h"
|
||||
#include <string>
|
||||
#include "gmock/gmock.h" // for testing::ElementsAreArray
|
||||
#include "include_gunit.h"
|
||||
#include "log.h" // for LOG
|
||||
|
||||
using testing::ElementsAreArray;
|
||||
|
||||
@ -55,11 +55,9 @@ TEST(UnicharsetTest, Basics) {
|
||||
EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
|
||||
// With the fi ligature encoding fails without a pre-cleanup.
|
||||
std::string lig_str = "af\ufb01ne";
|
||||
EXPECT_FALSE(
|
||||
u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
|
||||
EXPECT_FALSE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
|
||||
lig_str = u.CleanupString(lig_str.c_str());
|
||||
EXPECT_TRUE(
|
||||
u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
|
||||
EXPECT_TRUE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
|
||||
v = std::vector<int>(&labels[0], &labels[0] + labels.size());
|
||||
EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
|
||||
}
|
||||
@ -94,8 +92,8 @@ TEST(UnicharsetTest, Multibyte) {
|
||||
// The fi ligature is findable.
|
||||
EXPECT_EQ(u.unichar_to_id("\ufb01"), 6);
|
||||
std::vector<int> labels;
|
||||
EXPECT_TRUE(u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true,
|
||||
&labels, nullptr, nullptr));
|
||||
EXPECT_TRUE(
|
||||
u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, &labels, nullptr, nullptr));
|
||||
std::vector<int> v(&labels[0], &labels[0] + labels.size());
|
||||
EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7}));
|
||||
// With the fi ligature the fi is picked out.
|
||||
@ -104,8 +102,7 @@ TEST(UnicharsetTest, Multibyte) {
|
||||
std::string src_str = "\u0627\u062c\ufb01\u0635\u062b";
|
||||
// src_str has to be pre-cleaned for lengths to be correct.
|
||||
std::string cleaned = u.CleanupString(src_str.c_str());
|
||||
EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths,
|
||||
&encoded_length));
|
||||
EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, &encoded_length));
|
||||
EXPECT_EQ(encoded_length, cleaned.size());
|
||||
std::string len_str(&lengths[0], lengths.size());
|
||||
EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002");
|
||||
@ -150,12 +147,11 @@ TEST(UnicharsetTest, MultibyteBigrams) {
|
||||
TEST(UnicharsetTest, OldStyle) {
|
||||
// This test verifies an old unicharset that contains fi/fl ligatures loads
|
||||
// and keeps all the entries.
|
||||
std::string filename =
|
||||
file::JoinPath(TESTDATA_DIR, "eng.unicharset");
|
||||
std::string filename = file::JoinPath(TESTDATA_DIR, "eng.unicharset");
|
||||
UNICHARSET u;
|
||||
LOG(INFO) << "Filename=" << filename;
|
||||
EXPECT_TRUE(u.load_from_file(filename.c_str()));
|
||||
EXPECT_EQ(u.size(), 111);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tesseract
|
||||
|
@ -40,8 +40,7 @@ static int CodepointCount(const char* utf8, int len) {
|
||||
return CodepointDistance(utf8, utf8 + len);
|
||||
}
|
||||
|
||||
UnicodeText::const_iterator::difference_type
|
||||
distance(const UnicodeText::const_iterator& first,
|
||||
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first,
|
||||
const UnicodeText::const_iterator &last) {
|
||||
return CodepointDistance(first.it_, last.it_);
|
||||
}
|
||||
@ -92,7 +91,6 @@ static int ConvertToInterchangeValid(char* start, int len) {
|
||||
return out - in;
|
||||
}
|
||||
|
||||
|
||||
// *************** Data representation **********
|
||||
|
||||
// Note: the copy constructor is undefined.
|
||||
@ -101,7 +99,8 @@ static int ConvertToInterchangeValid(char* start, int len) {
|
||||
|
||||
void UnicodeText::Repr::reserve(int new_capacity) {
|
||||
// If there's already enough capacity, and we're an owner, do nothing.
|
||||
if (capacity_ >= new_capacity && ours_) return;
|
||||
if (capacity_ >= new_capacity && ours_)
|
||||
return;
|
||||
|
||||
// Otherwise, allocate a new buffer.
|
||||
capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
|
||||
@ -110,7 +109,8 @@ void UnicodeText::Repr::reserve(int new_capacity) {
|
||||
// If there is an old buffer, copy it into the new buffer.
|
||||
if (data_) {
|
||||
memcpy(new_data, data_, size_);
|
||||
if (ours_) delete[] data_; // If we owned the old buffer, free it.
|
||||
if (ours_)
|
||||
delete[] data_; // If we owned the old buffer, free it.
|
||||
}
|
||||
data_ = new_data;
|
||||
ours_ = true; // We own the new buffer.
|
||||
@ -121,9 +121,11 @@ void UnicodeText::Repr::resize(int new_size) {
|
||||
if (new_size == 0) {
|
||||
clear();
|
||||
} else {
|
||||
if (!ours_ || new_size > capacity_) reserve(new_size);
|
||||
if (!ours_ || new_size > capacity_)
|
||||
reserve(new_size);
|
||||
// Clear the memory in the expanded part.
|
||||
if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
|
||||
if (size_ < new_size)
|
||||
memset(data_ + size_, 0, new_size - size_);
|
||||
size_ = new_size;
|
||||
ours_ = true;
|
||||
}
|
||||
@ -132,7 +134,8 @@ void UnicodeText::Repr::resize(int new_size) {
|
||||
// This implementation of clear() deallocates the buffer if we're an owner.
|
||||
// That's not strictly necessary; we could just set size_ to 0.
|
||||
void UnicodeText::Repr::clear() {
|
||||
if (ours_) delete[] data_;
|
||||
if (ours_)
|
||||
delete[] data_;
|
||||
data_ = nullptr;
|
||||
size_ = capacity_ = 0;
|
||||
ours_ = true;
|
||||
@ -144,8 +147,10 @@ void UnicodeText::Repr::Copy(const char* data, int size) {
|
||||
}
|
||||
|
||||
void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {
|
||||
if (data == data_) return; // We already own this memory. (Weird case.)
|
||||
if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
|
||||
if (data == data_)
|
||||
return; // We already own this memory. (Weird case.)
|
||||
if (ours_ && data_)
|
||||
delete[] data_; // If we owned the old buffer, free it.
|
||||
data_ = data;
|
||||
size_ = size;
|
||||
capacity_ = capacity;
|
||||
@ -153,7 +158,8 @@ void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
|
||||
}
|
||||
|
||||
void UnicodeText::Repr::PointTo(const char *data, int size) {
|
||||
if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
|
||||
if (ours_ && data_)
|
||||
delete[] data_; // If we owned the old buffer, free it.
|
||||
data_ = const_cast<char *>(data);
|
||||
size_ = size;
|
||||
capacity_ = size;
|
||||
@ -167,21 +173,16 @@ void UnicodeText::Repr::append(const char* bytes, int byte_length) {
|
||||
}
|
||||
|
||||
string UnicodeText::Repr::DebugString() const {
|
||||
return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}",
|
||||
this,
|
||||
data_, size_, capacity_,
|
||||
ours_ ? "Owned" : "Alias");
|
||||
return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
|
||||
capacity_, ours_ ? "Owned" : "Alias");
|
||||
}
|
||||
|
||||
|
||||
|
||||
// *************** UnicodeText ******************
|
||||
|
||||
// ----- Constructors -----
|
||||
|
||||
// Default constructor
|
||||
UnicodeText::UnicodeText() {
|
||||
}
|
||||
UnicodeText::UnicodeText() {}
|
||||
|
||||
// Copy constructor
|
||||
UnicodeText::UnicodeText(const UnicodeText &src) {
|
||||
@ -195,13 +196,11 @@ UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
|
||||
repr_.append(first.it_, last.it_ - first.it_);
|
||||
}
|
||||
|
||||
string UnicodeText::UTF8Substring(const const_iterator& first,
|
||||
const const_iterator& last) {
|
||||
string UnicodeText::UTF8Substring(const const_iterator &first, const const_iterator &last) {
|
||||
CHECK(first <= last) << " Incompatible iterators";
|
||||
return string(first.it_, last.it_ - first.it_);
|
||||
}
|
||||
|
||||
|
||||
// ----- Copy -----
|
||||
|
||||
UnicodeText &UnicodeText::operator=(const UnicodeText &src) {
|
||||
@ -225,17 +224,14 @@ UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
|
||||
int byte_length) {
|
||||
UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {
|
||||
repr_.Copy(buffer, byte_length);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ----- TakeOwnershipOf -----
|
||||
|
||||
UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
|
||||
int byte_length,
|
||||
int byte_capacity) {
|
||||
UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {
|
||||
repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
|
||||
if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
|
||||
LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
|
||||
@ -244,8 +240,7 @@ UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
|
||||
return *this;
|
||||
}
|
||||
|
||||
UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
|
||||
int byte_length,
|
||||
UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,
|
||||
int byte_capacity) {
|
||||
repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
|
||||
return *this;
|
||||
@ -264,8 +259,7 @@ UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
|
||||
int byte_length) {
|
||||
UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {
|
||||
repr_.PointTo(buffer, byte_length);
|
||||
return *this;
|
||||
}
|
||||
@ -275,8 +269,7 @@ UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
UnicodeText& UnicodeText::PointTo(const const_iterator &first,
|
||||
const const_iterator &last) {
|
||||
UnicodeText &UnicodeText::PointTo(const const_iterator &first, const const_iterator &last) {
|
||||
CHECK(first <= last) << " Incompatible iterators";
|
||||
repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
|
||||
return *this;
|
||||
@ -289,8 +282,7 @@ UnicodeText& UnicodeText::append(const UnicodeText& u) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
UnicodeText& UnicodeText::append(const const_iterator& first,
|
||||
const const_iterator& last) {
|
||||
UnicodeText &UnicodeText::append(const const_iterator &first, const const_iterator &last) {
|
||||
CHECK(first <= last) << " Incompatible iterators";
|
||||
repr_.append(first.it_, last.it_ - first.it_);
|
||||
return *this;
|
||||
@ -314,8 +306,8 @@ UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
|
||||
return UnsafeFind(look, begin());
|
||||
}
|
||||
|
||||
UnicodeText::const_iterator UnicodeText::UnsafeFind(
|
||||
const UnicodeText& look, const_iterator start_pos) const {
|
||||
UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
|
||||
const_iterator start_pos) const {
|
||||
// Due to the magic of the UTF8 encoding, searching for a sequence of
|
||||
// letters is equivalent to substring search.
|
||||
StringPiece searching(utf8_data(), utf8_length());
|
||||
@ -324,7 +316,8 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(
|
||||
// StringPiece::size_type found =
|
||||
// searching.find(look_piece, start_pos.utf8_data() - utf8_data());
|
||||
StringPiece::size_type found = StringPiece::npos;
|
||||
if (found == StringPiece::npos) return end();
|
||||
if (found == StringPiece::npos)
|
||||
return end();
|
||||
return const_iterator(utf8_data() + found);
|
||||
}
|
||||
|
||||
@ -350,7 +343,6 @@ void UnicodeText::clear() {
|
||||
// Destructor
|
||||
UnicodeText::~UnicodeText() {}
|
||||
|
||||
|
||||
void UnicodeText::push_back(char32 c) {
|
||||
if (UniLib::IsValidCodepoint(c)) {
|
||||
char buf[UTFmax];
|
||||
@ -358,8 +350,7 @@ void UnicodeText::push_back(char32 c) {
|
||||
if (UniLib::IsInterchangeValid(buf, len)) {
|
||||
repr_.append(buf, len);
|
||||
} else {
|
||||
LOG(WARNING) << "Unicode value 0x" << std::hex << c
|
||||
<< " is not valid for interchange";
|
||||
LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
|
||||
repr_.append(" ", 1);
|
||||
}
|
||||
} else {
|
||||
@ -373,19 +364,18 @@ int UnicodeText::size() const {
|
||||
}
|
||||
|
||||
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
|
||||
if (&lhs == &rhs) return true;
|
||||
if (lhs.repr_.size_ != rhs.repr_.size_) return false;
|
||||
if (&lhs == &rhs)
|
||||
return true;
|
||||
if (lhs.repr_.size_ != rhs.repr_.size_)
|
||||
return false;
|
||||
return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
|
||||
}
|
||||
|
||||
string UnicodeText::DebugString() const {
|
||||
return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
|
||||
this,
|
||||
size(),
|
||||
return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
|
||||
repr_.DebugString().c_str());
|
||||
}
|
||||
|
||||
|
||||
// ******************* UnicodeText::const_iterator *********************
|
||||
|
||||
// The implementation of const_iterator would be nicer if it
|
||||
@ -394,12 +384,9 @@ string UnicodeText::DebugString() const {
|
||||
|
||||
UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
|
||||
|
||||
UnicodeText::const_iterator::const_iterator(const const_iterator& other)
|
||||
: it_(other.it_) {
|
||||
}
|
||||
UnicodeText::const_iterator::const_iterator(const const_iterator &other) : it_(other.it_) {}
|
||||
|
||||
UnicodeText::const_iterator&
|
||||
UnicodeText::const_iterator::operator=(const const_iterator& other) {
|
||||
UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(const const_iterator &other) {
|
||||
if (&other != this)
|
||||
it_ = other.it_;
|
||||
return *this;
|
||||
@ -413,8 +400,7 @@ UnicodeText::const_iterator UnicodeText::end() const {
|
||||
return const_iterator(repr_.data_ + repr_.size_);
|
||||
}
|
||||
|
||||
bool operator<(const UnicodeText::const_iterator& lhs,
|
||||
const UnicodeText::const_iterator& rhs) {
|
||||
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs) {
|
||||
return lhs.it_ < rhs.it_;
|
||||
}
|
||||
|
||||
@ -431,20 +417,14 @@ char32 UnicodeText::const_iterator::operator*() const {
|
||||
|
||||
unsigned char byte2 = it_[1];
|
||||
if (byte1 < 0xE0)
|
||||
return ((byte1 & 0x1F) << 6)
|
||||
| (byte2 & 0x3F);
|
||||
return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
|
||||
|
||||
unsigned char byte3 = it_[2];
|
||||
if (byte1 < 0xF0)
|
||||
return ((byte1 & 0x0F) << 12)
|
||||
| ((byte2 & 0x3F) << 6)
|
||||
| (byte3 & 0x3F);
|
||||
return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
|
||||
|
||||
unsigned char byte4 = it_[3];
|
||||
return ((byte1 & 0x07) << 18)
|
||||
| ((byte2 & 0x3F) << 12)
|
||||
| ((byte3 & 0x3F) << 6)
|
||||
| (byte4 & 0x3F);
|
||||
return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
|
||||
}
|
||||
|
||||
UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
|
||||
@ -453,14 +433,21 @@ UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
|
||||
}
|
||||
|
||||
UnicodeText::const_iterator &UnicodeText::const_iterator::operator--() {
|
||||
while (UniLib::IsTrailByte(*--it_));
|
||||
while (UniLib::IsTrailByte(*--it_))
|
||||
;
|
||||
return *this;
|
||||
}
|
||||
|
||||
int UnicodeText::const_iterator::get_utf8(char *utf8_output) const {
|
||||
utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1;
|
||||
utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2;
|
||||
utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3;
|
||||
utf8_output[0] = it_[0];
|
||||
if ((it_[0] & 0xff) < 0x80)
|
||||
return 1;
|
||||
utf8_output[1] = it_[1];
|
||||
if ((it_[0] & 0xff) < 0xE0)
|
||||
return 2;
|
||||
utf8_output[2] = it_[2];
|
||||
if ((it_[0] & 0xff) < 0xF0)
|
||||
return 3;
|
||||
utf8_output[3] = it_[3];
|
||||
return 4;
|
||||
}
|
||||
@ -496,12 +483,12 @@ string UnicodeText::const_iterator::DebugString() const {
|
||||
return tensorflow::strings::Printf("{iter %p}", it_);
|
||||
}
|
||||
|
||||
|
||||
// *************************** Utilities *************************
|
||||
|
||||
string CodepointString(const UnicodeText &t) {
|
||||
string s;
|
||||
UnicodeText::const_iterator it = t.begin(), end = t.end();
|
||||
while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++);
|
||||
while (it != end)
|
||||
tensorflow::strings::Appendf(&s, "%X ", *it++);
|
||||
return s;
|
||||
}
|
||||
|
@ -131,18 +131,21 @@ class UnicodeText {
|
||||
|
||||
// x.Copy(y) copies the data from y into x.
|
||||
UnicodeText &Copy(const UnicodeText &src);
|
||||
inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
|
||||
inline UnicodeText &assign(const UnicodeText &src) {
|
||||
return Copy(src);
|
||||
}
|
||||
|
||||
// x.PointTo(y) changes x so that it points to y's data.
|
||||
// It does not copy y or take ownership of y's data.
|
||||
UnicodeText &PointTo(const UnicodeText &src);
|
||||
UnicodeText& PointTo(const const_iterator& first,
|
||||
const const_iterator& last);
|
||||
UnicodeText &PointTo(const const_iterator &first, const const_iterator &last);
|
||||
|
||||
~UnicodeText();
|
||||
|
||||
void clear(); // Clear text.
|
||||
bool empty() const { return repr_.size_ == 0; } // Test if text is empty.
|
||||
bool empty() const {
|
||||
return repr_.size_ == 0;
|
||||
} // Test if text is empty.
|
||||
|
||||
// Add a codepoint to the end of the text.
|
||||
// If the codepoint is not interchange-valid, add a space instead
|
||||
@ -158,7 +161,9 @@ class UnicodeText {
|
||||
// utext.append(more_chars.begin(), more_chars.end());
|
||||
template <typename ForwardIterator>
|
||||
UnicodeText &append(ForwardIterator first, const ForwardIterator last) {
|
||||
while (first != last) { push_back(*first++); }
|
||||
while (first != last) {
|
||||
push_back(*first++);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -175,6 +180,7 @@ class UnicodeText {
|
||||
|
||||
class const_iterator {
|
||||
typedef const_iterator CI;
|
||||
|
||||
public:
|
||||
typedef std::bidirectional_iterator_tag iterator_category;
|
||||
typedef char32 value_type;
|
||||
@ -207,16 +213,21 @@ class UnicodeText {
|
||||
|
||||
// We love relational operators.
|
||||
friend bool operator==(const CI &lhs, const CI &rhs) {
|
||||
return lhs.it_ == rhs.it_; }
|
||||
return lhs.it_ == rhs.it_;
|
||||
}
|
||||
friend bool operator!=(const CI &lhs, const CI &rhs) {
|
||||
return !(lhs == rhs); }
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
friend bool operator<(const CI &lhs, const CI &rhs);
|
||||
friend bool operator>(const CI &lhs, const CI &rhs) {
|
||||
return rhs < lhs; }
|
||||
return rhs < lhs;
|
||||
}
|
||||
friend bool operator<=(const CI &lhs, const CI &rhs) {
|
||||
return !(rhs < lhs); }
|
||||
return !(rhs < lhs);
|
||||
}
|
||||
friend bool operator>=(const CI &lhs, const CI &rhs) {
|
||||
return !(lhs < rhs); }
|
||||
return !(lhs < rhs);
|
||||
}
|
||||
|
||||
friend difference_type distance(const CI &first, const CI &last);
|
||||
|
||||
@ -230,7 +241,9 @@ class UnicodeText {
|
||||
// Return the byte length of the UTF-8 character the iterator points to.
|
||||
int utf8_length() const;
|
||||
// Return the iterator's pointer into the UTF-8 data.
|
||||
const char* utf8_data() const { return it_; }
|
||||
const char *utf8_data() const {
|
||||
return it_;
|
||||
}
|
||||
|
||||
string DebugString() const;
|
||||
|
||||
@ -248,8 +261,8 @@ class UnicodeText {
|
||||
|
||||
class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
|
||||
public:
|
||||
explicit const_reverse_iterator(const_iterator it) :
|
||||
std::reverse_iterator<const_iterator>(it) {}
|
||||
explicit const_reverse_iterator(const_iterator it)
|
||||
: std::reverse_iterator<const_iterator>(it) {}
|
||||
const char *utf8_data() const {
|
||||
const_iterator tmp_it = base();
|
||||
return (--tmp_it).utf8_data();
|
||||
@ -289,13 +302,18 @@ class UnicodeText {
|
||||
//
|
||||
// Return the data, length, and capacity of UTF-8-encoded version of
|
||||
// the text. Length and capacity are measured in bytes.
|
||||
const char* utf8_data() const { return repr_.data_; }
|
||||
int utf8_length() const { return repr_.size_; }
|
||||
int utf8_capacity() const { return repr_.capacity_; }
|
||||
const char *utf8_data() const {
|
||||
return repr_.data_;
|
||||
}
|
||||
int utf8_length() const {
|
||||
return repr_.size_;
|
||||
}
|
||||
int utf8_capacity() const {
|
||||
return repr_.capacity_;
|
||||
}
|
||||
|
||||
// Return the UTF-8 data as a string.
|
||||
static string UTF8Substring(const const_iterator& first,
|
||||
const const_iterator& last);
|
||||
static string UTF8Substring(const const_iterator &first, const const_iterator &last);
|
||||
|
||||
// There are three methods for initializing a UnicodeText from UTF-8
|
||||
// data. They vary in details of memory management. In all cases,
|
||||
@ -309,9 +327,7 @@ class UnicodeText {
|
||||
|
||||
// x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
|
||||
// buf. buf is not copied.
|
||||
UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
|
||||
int byte_length,
|
||||
int byte_capacity);
|
||||
UnicodeText &TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
|
||||
|
||||
// x.PointToUTF8(buf,len) changes x so that it points to buf
|
||||
// ("becomes an alias"). It does not take ownership or copy buf.
|
||||
@ -340,7 +356,10 @@ class UnicodeText {
|
||||
bool ours_; // Do we own data_?
|
||||
|
||||
Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
|
||||
~Repr() { if (ours_) delete[] data_; }
|
||||
~Repr() {
|
||||
if (ours_)
|
||||
delete[] data_;
|
||||
}
|
||||
|
||||
void clear();
|
||||
void reserve(int capacity);
|
||||
@ -367,12 +386,10 @@ class UnicodeText {
|
||||
// is not interchange-valid.
|
||||
//
|
||||
UnicodeText &UnsafeCopyUTF8(const char *utf8_buffer, int byte_length);
|
||||
UnicodeText& UnsafeTakeOwnershipOfUTF8(
|
||||
char* utf8_buffer, int byte_length, int byte_capacity);
|
||||
UnicodeText &UnsafeTakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
|
||||
UnicodeText &UnsafePointToUTF8(const char *utf8_buffer, int byte_length);
|
||||
UnicodeText &UnsafeAppendUTF8(const char *utf8_buffer, int byte_length);
|
||||
const_iterator UnsafeFind(const UnicodeText& look,
|
||||
const_iterator start_pos) const;
|
||||
const_iterator UnsafeFind(const UnicodeText &look, const_iterator start_pos) const;
|
||||
};
|
||||
|
||||
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
|
||||
@ -383,14 +400,12 @@ inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
|
||||
|
||||
// UnicodeTextRange is a pair of iterators, useful for specifying text
|
||||
// segments. If the iterators are ==, the segment is empty.
|
||||
typedef pair<UnicodeText::const_iterator,
|
||||
UnicodeText::const_iterator> UnicodeTextRange;
|
||||
typedef pair<UnicodeText::const_iterator, UnicodeText::const_iterator> UnicodeTextRange;
|
||||
|
||||
inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange &r) {
|
||||
return r.first == r.second;
|
||||
}
|
||||
|
||||
|
||||
// *************************** Utilities *************************
|
||||
|
||||
// A factory function for creating a UnicodeText from a buffer of
|
||||
@ -402,18 +417,17 @@ inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
|
||||
// replaced with a space, even if the codepoint was represented with a
|
||||
// multibyte sequence in the UTF-8 data.
|
||||
//
|
||||
inline UnicodeText MakeUnicodeTextAcceptingOwnership(
|
||||
char* utf8_buffer, int byte_length, int byte_capacity) {
|
||||
return UnicodeText().TakeOwnershipOfUTF8(
|
||||
utf8_buffer, byte_length, byte_capacity);
|
||||
inline UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length,
|
||||
int byte_capacity) {
|
||||
return UnicodeText().TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);
|
||||
}
|
||||
|
||||
// A factory function for creating a UnicodeText from a buffer of
|
||||
// UTF-8 data. The new UnicodeText does not take ownership of the
|
||||
// buffer. (It is an "alias.")
|
||||
//
|
||||
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
|
||||
const char* utf8_buffer, int byte_length) {
|
||||
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(const char *utf8_buffer,
|
||||
int byte_length) {
|
||||
return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
|
||||
}
|
||||
|
||||
@ -434,8 +448,7 @@ inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
|
||||
// made (as if do_copy were true) and coerced to valid UTF-8 by
|
||||
// replacing each invalid byte with a space.
|
||||
//
|
||||
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
|
||||
bool do_copy) {
|
||||
inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy) {
|
||||
UnicodeText t;
|
||||
if (do_copy) {
|
||||
t.CopyUTF8(utf8_buf, len);
|
||||
|
@ -32,8 +32,7 @@ namespace UniLib {
|
||||
// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
|
||||
bool IsInterchangeValid(char32 c) {
|
||||
return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
|
||||
(c >= 0x7F && c <= 0x9F) ||
|
||||
(c >= 0xD800 && c <= 0xDFFF) ||
|
||||
(c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) ||
|
||||
(c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE);
|
||||
}
|
||||
|
||||
@ -46,8 +45,7 @@ int SpanInterchangeValid(const char* begin, int byte_length) {
|
||||
// We want to accept Runeerror == U+FFFD as a valid char, but it is used
|
||||
// by chartorune to indicate error. Luckily, the real codepoint is size 3
|
||||
// while errors return bytes_consumed <= 1.
|
||||
if ((rune == Runeerror && bytes_consumed <= 1) ||
|
||||
!IsInterchangeValid(rune)) {
|
||||
if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) {
|
||||
break; // Found
|
||||
}
|
||||
p += bytes_consumed;
|
||||
|
@ -29,8 +29,7 @@ namespace UniLib {
|
||||
// (i.e., is not a surrogate codepoint). See also
|
||||
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
|
||||
inline bool IsValidCodepoint(char32 c) {
|
||||
return (static_cast<uint32>(c) < 0xD800)
|
||||
|| (c >= 0xE000 && c <= 0x10FFFF);
|
||||
return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
|
||||
}
|
||||
|
||||
// Returns true if 'str' is the start of a structurally valid UTF-8
|
||||
@ -41,9 +40,8 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) {
|
||||
char32 c;
|
||||
int consumed;
|
||||
// It's OK if str.length() > consumed.
|
||||
return !str.empty()
|
||||
&& isvalidcharntorune(str.data(), str.size(), &c, &consumed)
|
||||
&& IsValidCodepoint(c);
|
||||
return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) &&
|
||||
IsValidCodepoint(c);
|
||||
}
|
||||
|
||||
// Returns the length (number of bytes) of the Unicode code point
|
||||
|
@ -18,9 +18,8 @@ namespace tesseract {
|
||||
TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
|
||||
std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
// It made 3 graphemes.
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
@ -32,9 +31,8 @@ TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
|
||||
TEST(ValidateGraphemeTest, SingleConsonantOK) {
|
||||
std::string str = "\u0cb9"; // HA
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
@ -43,9 +41,8 @@ TEST(ValidateGraphemeTest, SingleConsonantOK) {
|
||||
TEST(ValidateGraphemeTest, SimpleCV) {
|
||||
std::string str = "\u0cb9\u0cbf"; // HA I
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
@ -54,15 +51,14 @@ TEST(ValidateGraphemeTest, SimpleCV) {
|
||||
TEST(ValidateGraphemeTest, SubscriptConjunct) {
|
||||
std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
|
||||
@ -71,15 +67,14 @@ TEST(ValidateGraphemeTest, SubscriptConjunct) {
|
||||
TEST(ValidateGraphemeTest, HalfFormJoiner) {
|
||||
std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
|
||||
@ -88,15 +83,14 @@ TEST(ValidateGraphemeTest, HalfFormJoiner) {
|
||||
TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
|
||||
std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
|
||||
@ -105,38 +99,36 @@ TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
|
||||
TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
|
||||
std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
|
||||
// Malayalam only, so not allowed in Telugu.
|
||||
str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
|
||||
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
}
|
||||
|
||||
TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
|
||||
std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 2);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0d24"));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
|
||||
@ -146,15 +138,14 @@ TEST(ValidateGraphemeTest, ThaiGraphemes) {
|
||||
// This is a single grapheme unless in glyph split mode
|
||||
std::string str = "\u0e14\u0e38\u0e4a";
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 1);
|
||||
EXPECT_EQ(glyphs[0], str);
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[0], std::string("\u0e14"));
|
||||
@ -164,9 +155,8 @@ TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
|
||||
std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
|
||||
std::vector<std::string> glyphs;
|
||||
// Returns true, but the joiner is gone.
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs))
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(glyphs.size(), 5);
|
||||
EXPECT_EQ(glyphs[0], std::string("'"));
|
||||
|
@ -30,17 +30,15 @@ TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {
|
||||
std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ
|
||||
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);
|
||||
// Same result if we started with the normalized string.
|
||||
ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1,
|
||||
target_str);
|
||||
ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);
|
||||
}
|
||||
|
||||
// Only one dependent vowel is allowed.
|
||||
TEST(ValidateIndicTest, OnlyOneDependentVowel) {
|
||||
std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU
|
||||
std::string dest;
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&dest))
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
}
|
||||
|
||||
@ -55,22 +53,19 @@ TEST(ValidateIndicTest, OnlyOneDependentVowel) {
|
||||
TEST(ValidateIndicTest, OnlyOneVowelModifier) {
|
||||
std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu
|
||||
std::string result;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
// It made 1 grapheme of 4 chars, by terminating the explicit virama.
|
||||
EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result);
|
||||
|
||||
str = "\u0995\u0983\u0981"; // KA visarga candrabindu
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
|
||||
// Exception: Malayalam allows multiple anusvara.
|
||||
str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
EXPECT_EQ(str, result);
|
||||
}
|
||||
|
||||
@ -85,14 +80,13 @@ TEST(ValidateIndicTest, OnlyOneVowelModifier) {
|
||||
TEST(ValidateIndicTest, VowelModifierMustBeLast) {
|
||||
std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I
|
||||
std::string dest;
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&dest))
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
// Swap c02/c3f and all is ok.
|
||||
str = "\u0c28\u0c3f\u0c02"; // NA I Sunna
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
}
|
||||
@ -108,13 +102,12 @@ TEST(ValidateIndicTest, VowelModifierMustBeLast) {
|
||||
TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
|
||||
std::string str = "\u0c05\u0c47"; // A EE
|
||||
std::string dest;
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&dest))
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
str = "\u0c1e\u0c3e"; // NYA AA
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
}
|
||||
@ -123,12 +116,11 @@ TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
|
||||
TEST(ValidateIndicTest, SubGraphemes) {
|
||||
std::string str = "\u0d3e"; // AA
|
||||
std::string dest;
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&dest))
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNone, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
}
|
||||
@ -136,9 +128,9 @@ TEST(ValidateIndicTest, SubGraphemes) {
|
||||
TEST(ValidateIndicTest, Nukta) {
|
||||
std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9"));
|
||||
// Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
|
||||
@ -150,20 +142,20 @@ TEST(ValidateIndicTest, Nukta) {
|
||||
TEST(ValidateIndicTest, SinhalaRakaransaya) {
|
||||
std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna
|
||||
std::string dest;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 2);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb"));
|
||||
// Can be followed by a dependent vowel.
|
||||
str += "\u0dd9"; // E
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
}
|
||||
@ -171,20 +163,20 @@ TEST(ValidateIndicTest, SinhalaRakaransaya) {
|
||||
TEST(ValidateIndicTest, SinhalaYansaya) {
|
||||
std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna
|
||||
std::string dest;
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
// Can be followed by a dependent vowel.
|
||||
str += "\u0ddd"; // OO
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(), &dest))
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &dest))
|
||||
<< PrintString32WithUnicodes(str);
|
||||
EXPECT_EQ(dest, str);
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba"));
|
||||
}
|
||||
@ -192,14 +184,14 @@ TEST(ValidateIndicTest, SinhalaYansaya) {
|
||||
TEST(ValidateIndicTest, SinhalaRepaya) {
|
||||
std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 2);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8"));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 3);
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
|
||||
}
|
||||
@ -208,9 +200,9 @@ TEST(ValidateIndicTest, SinhalaSpecials) {
|
||||
// Sinhala has some exceptions from the usual rules.
|
||||
std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(glyphs[0], std::string("\u0dc0"));
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0d9c"));
|
||||
@ -218,9 +210,9 @@ TEST(ValidateIndicTest, SinhalaSpecials) {
|
||||
EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d"));
|
||||
EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d"));
|
||||
str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);
|
||||
EXPECT_EQ(glyphs[0], std::string("\u0dc3"));
|
||||
EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
|
||||
|
@ -32,19 +32,16 @@ TEST(ValidateKhmerTest, BadKhmerWords) {
|
||||
std::string result;
|
||||
// Multiple dependent vowels not allowed
|
||||
std::string str = "\u1796\u17b6\u17b7";
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
// Multiple shifters not allowed
|
||||
str = "\u1798\u17c9\u17ca";
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
// Multiple signs not allowed
|
||||
str = "\u1780\u17b6\u17cb\u17cd";
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -27,27 +27,25 @@ TEST(ValidateMyanmarTest, GoodMyanmarWords) {
|
||||
TEST(ValidateMyanmarTest, BadMyanmarWords) {
|
||||
std::string str = "က်န္းမာေရး";
|
||||
std::vector<std::string> glyphs;
|
||||
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
||||
str.c_str(), &glyphs));
|
||||
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kCombined, true, str.c_str(),
|
||||
&glyphs));
|
||||
std::string result;
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
// It works if the grapheme normalization is turned off.
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNone, str.c_str(), &result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone,
|
||||
str.c_str(), &result));
|
||||
EXPECT_EQ(str, result);
|
||||
str = "ခုႏွစ္";
|
||||
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
|
||||
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
||||
true, str.c_str(), &glyphs));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNormalize, str.c_str(),
|
||||
&result));
|
||||
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNormMode::kGlyphSplit, true, str.c_str(),
|
||||
&glyphs));
|
||||
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
|
||||
str.c_str(), &result));
|
||||
// It works if the grapheme normalization is turned off.
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
||||
GraphemeNorm::kNone, str.c_str(), &result));
|
||||
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNone,
|
||||
str.c_str(), &result));
|
||||
EXPECT_EQ(str, result);
|
||||
}
|
||||
|
||||
|
@ -18,8 +18,7 @@ namespace tesseract {
|
||||
|
||||
class TestableValidator : public Validator {
|
||||
public:
|
||||
static ViramaScript TestableMostFrequentViramaScript(
|
||||
const std::vector<char32>& utf32) {
|
||||
static ViramaScript TestableMostFrequentViramaScript(const std::vector<char32> &utf32) {
|
||||
return MostFrequentViramaScript(utf32);
|
||||
}
|
||||
};
|
||||
@ -29,46 +28,38 @@ class TestableValidator : public Validator {
|
||||
TEST(ValidatorTest, MostFrequentViramaScript) {
|
||||
// The most frequent virama script should come out correct, despite
|
||||
// distractions from other scripts.
|
||||
EXPECT_EQ(ViramaScript::kTelugu,
|
||||
TestableValidator::TestableMostFrequentViramaScript({0xc05}));
|
||||
EXPECT_EQ(ViramaScript::kTelugu, TestableValidator::TestableMostFrequentViramaScript({0xc05}));
|
||||
// It is still Telugu surrounded by Latin.
|
||||
EXPECT_EQ(ViramaScript::kTelugu,
|
||||
TestableValidator::TestableMostFrequentViramaScript(
|
||||
{'a', 0xc05, 'b', 'c'}));
|
||||
TestableValidator::TestableMostFrequentViramaScript({'a', 0xc05, 'b', 'c'}));
|
||||
// But not still Telugu surrounded by Devanagari.
|
||||
EXPECT_EQ(ViramaScript::kDevanagari,
|
||||
TestableValidator::TestableMostFrequentViramaScript(
|
||||
{0x905, 0xc05, 0x906, 0x907}));
|
||||
TestableValidator::TestableMostFrequentViramaScript({0x905, 0xc05, 0x906, 0x907}));
|
||||
EXPECT_EQ(ViramaScript::kKannada,
|
||||
TestableValidator::TestableMostFrequentViramaScript(
|
||||
{0xc85, 0xc05, 0xc86, 0xc87}));
|
||||
TestableValidator::TestableMostFrequentViramaScript({0xc85, 0xc05, 0xc86, 0xc87}));
|
||||
EXPECT_EQ(ViramaScript::kBengali,
|
||||
TestableValidator::TestableMostFrequentViramaScript(
|
||||
{0x985, 0xc05, 0x986, 0x987}));
|
||||
TestableValidator::TestableMostFrequentViramaScript({0x985, 0xc05, 0x986, 0x987}));
|
||||
// Danda and double Danda don't count as Devanagari, as they are common.
|
||||
EXPECT_EQ(ViramaScript::kTelugu,
|
||||
TestableValidator::TestableMostFrequentViramaScript(
|
||||
{0x964, 0xc05, 0x965, 0x965}));
|
||||
TestableValidator::TestableMostFrequentViramaScript({0x964, 0xc05, 0x965, 0x965}));
|
||||
}
|
||||
|
||||
// ValidateCleanAndSegment doesn't modify the input by much, but its
|
||||
// transformation should be idempotent. (Doesn't change again if re-applied.)
|
||||
TEST(ValidatorTest, Idempotency) {
|
||||
std::vector<char32> str1(
|
||||
{0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c});
|
||||
std::vector<char32> str2(
|
||||
{0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''});
|
||||
std::vector<char32> str1({0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c});
|
||||
std::vector<char32> str2({0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''});
|
||||
std::vector<std::vector<char32>> result1, result2, result3, result4;
|
||||
EXPECT_TRUE(Validator::ValidateCleanAndSegment(
|
||||
GraphemeNormMode::kSingleString, true, str1, &result1));
|
||||
EXPECT_TRUE(Validator::ValidateCleanAndSegment(
|
||||
GraphemeNormMode::kSingleString, true, result1[0], &result2));
|
||||
EXPECT_TRUE(
|
||||
Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str1, &result1));
|
||||
EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result1[0],
|
||||
&result2));
|
||||
EXPECT_EQ(result1.size(), result2.size());
|
||||
EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0]));
|
||||
EXPECT_TRUE(Validator::ValidateCleanAndSegment(
|
||||
GraphemeNormMode::kSingleString, true, str2, &result3));
|
||||
EXPECT_TRUE(Validator::ValidateCleanAndSegment(
|
||||
GraphemeNormMode::kSingleString, true, result3[0], &result4));
|
||||
EXPECT_TRUE(
|
||||
Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, str2, &result3));
|
||||
EXPECT_TRUE(Validator::ValidateCleanAndSegment(GraphemeNormMode::kSingleString, true, result3[0],
|
||||
&result4));
|
||||
EXPECT_EQ(result3.size(), result4.size());
|
||||
EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0]));
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user