tesseract/unittest/tatweel_test.cc

#include "tesseract/ccutil/unicharset.h"
#include "tesseract/dict/dawg.h"
#include "tesseract/dict/trie.h"
#include "util/utf8/public/unicodetext.h"

namespace {

// Test fixture whose constructor reads testdata/ara.wordlist and offers every
// element of that text to unicharset_, counting along the way how many of
// them contain the Tatweel character (U+0640).
class TatweelTest : public ::testing::Test {
 protected:
  TatweelTest() {
    // Read the complete word list into memory; CHECK_OK guards the read.
    const string wordlist_path = TestDataNameToPath("ara.wordlist");
    string contents;
    CHECK_OK(file::GetContents(wordlist_path, &contents, file::Defaults()));

    // Step through the text, inserting each element's UTF-8 form into
    // unicharset_ and tallying the ones that contain a Tatweel.
    UnicodeText chars;
    chars.PointToUTF8(contents.data(), contents.size());
    int num_tatweel = 0;
    auto cursor = chars.begin();
    while (cursor != chars.end()) {
      const string encoded = cursor.get_utf8_string();
      const bool has_tatweel = encoded.find(u8"\u0640") != string::npos;
      if (has_tatweel) {
        ++num_tatweel;
      }
      unicharset_.unichar_insert(encoded.c_str());
      ++cursor;
    }
    LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
    // Sanity check: the source data is expected to contain at least one
    // Tatweel.
    EXPECT_GT(num_tatweel, 0);
  }

  // Returns the path of the test data file `name`, i.e. "testdata/" + name
  // joined onto FLAGS_test_srcdir.
  string TestDataNameToPath(const string& name) {
    const string relative_path = "testdata/" + name;
    return file::JoinPath(FLAGS_test_srcdir, relative_path);
  }

  // Character set populated by the constructor and examined by the tests.
  UNICHARSET unicharset_;
};

// Checks that no entry of the unicharset built by the fixture contains the
// Tatweel character (U+0640).
TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
  for (int unichar_id = 0; unichar_id < unicharset_.size(); ++unichar_id) {
    // UTF-8 text stored for this id; it must not include the Tatweel bytes.
    const char* const utf8 = unicharset_.id_to_unichar(unichar_id);
    EXPECT_EQ(strstr(utf8, u8"\u0640"), nullptr);
  }
}

// Checks that the dictionary ignores the Tatweel character: a trie built from
// ara.wordlist with the fixture's unicharset_ must report zero from
// check_for_words when run against that same word list.
TEST_F(TatweelTest, DictIgnoresTatweel) {
  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM,
                       unicharset_.size(), 0);
  const string filename = TestDataNameToPath("ara.wordlist");
  // Fatal assertion: if the word list could not be added, the lookup below
  // would only pile a second, misleading failure on top of the real one.
  ASSERT_TRUE(trie.read_and_add_word_list(
      filename.c_str(), unicharset_, tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));
  // NOTE(review): check_for_words presumably returns the number of words in
  // the file that are missing from the trie - confirm against the Dawg API.
  EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
}

// Checks that loading an existing unicharset file keeps any Tatweel entries
// it already contains, for backwards compatibility.
TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
  const string filename = TestDataNameToPath("ara.unicharset");
  // Fatal assertion: if the load fails, counting entries below would report a
  // confusing "expected 4" failure instead of pointing at the failed load.
  ASSERT_TRUE(unicharset_.load_from_file(filename.c_str()));
  // Count the entries whose UTF-8 text contains the Tatweel (U+0640).
  int num_tatweel = 0;
  for (int i = 0; i < unicharset_.size(); ++i) {
    const char* utf8 = unicharset_.id_to_unichar(i);
    if (strstr(utf8, u8"\u0640") != nullptr) ++num_tatweel;
  }
  LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
  // The test expects exactly 4 Tatweel-bearing entries after loading
  // ara.unicharset.
  EXPECT_EQ(num_tatweel, 4);
}

}  // namespace
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`#include "tesseract/ccutil/unicharset.h"`
			`#include "tesseract/dict/dawg.h"`
			`#include "tesseract/dict/trie.h"`
			`#include "util/utf8/public/unicodetext.h"`

			`namespace {`

			`class TatweelTest : public ::testing::Test {`
			`protected:`
			`TatweelTest() {`
			`string filename = TestDataNameToPath("ara.wordlist");`
			`string wordlist;`
			`CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));`
			`// Put all the unicodes in the unicharset_.`
			`UnicodeText text;`
			`text.PointToUTF8(wordlist.data(), wordlist.size());`
			`int num_tatweel = 0;`
			`for (auto it = text.begin(); it != text.end(); ++it) {`
			`string utf8 = it.get_utf8_string();`
			`if (utf8.find(u8"\u0640") != string::npos) ++num_tatweel;`
			`unicharset_.unichar_insert(utf8.c_str());`
			`}`
			`LOG(INFO) << "Num tatweels in source data=" << num_tatweel;`
			`EXPECT_GT(num_tatweel, 0);`
			`}`

			`string TestDataNameToPath(const string& name) {`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`return file::JoinPath(FLAGS_test_srcdir, "testdata/" + name);`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`}`
			`UNICHARSET unicharset_;`
			`};`

			`TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {`
			`// This test verifies that the unicharset ignores the Tatweel character.`
			`for (int i = 0; i < unicharset_.size(); ++i) {`
			`const char* utf8 = unicharset_.id_to_unichar(i);`
			`EXPECT_EQ(strstr(utf8, u8"\u0640"), nullptr);`
			`}`
			`}`

			`TEST_F(TatweelTest, DictIgnoresTatweel) {`
			`// This test verifies that the dictionary ignores the Tatweel character.`
			`tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM,`
			`unicharset_.size(), 0);`
			`string filename = TestDataNameToPath("ara.wordlist");`
			`EXPECT_TRUE(trie.read_and_add_word_list(`
			`filename.c_str(), unicharset_, tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));`
			`EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));`
			`}`

			`TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {`
			`// This test verifies that a load of an existing unicharset keeps any`
			`// existing tatweel for backwards compatability.`
			`string filename = TestDataNameToPath("ara.unicharset");`
			`EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));`
			`int num_tatweel = 0;`
			`for (int i = 0; i < unicharset_.size(); ++i) {`
			`const char* utf8 = unicharset_.id_to_unichar(i);`
			`if (strstr(utf8, u8"\u0640") != nullptr) ++num_tatweel;`
			`}`
			`LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;`
			`EXPECT_EQ(num_tatweel, 4);`
			`}`

			`} // namespace`