2019-06-24 18:52:06 +08:00
|
|
|
// (C) Copyright 2017, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
#if defined(_WIN32)
|
2021-03-13 05:06:34 +08:00
|
|
|
# include <io.h> // for _access
|
2019-06-24 18:52:06 +08:00
|
|
|
#else
|
2021-03-13 05:06:34 +08:00
|
|
|
# include <unistd.h> // for access
|
2019-06-24 18:52:06 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "dawg.h"
|
2021-03-13 05:06:34 +08:00
|
|
|
#include "include_gunit.h"
|
2019-06-24 18:52:06 +08:00
|
|
|
#include "trie.h"
|
|
|
|
#include "unicharset.h"
|
2021-08-08 18:10:20 +08:00
|
|
|
#include "util/utf8/unicodetext.h" // for UnicodeText
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2020-12-27 17:41:48 +08:00
|
|
|
namespace tesseract {
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2019-06-24 18:52:06 +08:00
|
|
|
// Replacement for std::filesystem::exists (C++-17).
//
// Returns true when the platform call access() (_access() on Windows)
// succeeds for `filename` with mode 0, false otherwise.
// A null `filename` is reported as "does not exist" instead of being passed
// on to the C library, which does not accept a null path.
static bool file_exists(const char *filename) {
  if (filename == nullptr) {
    return false;
  }
#if defined(_WIN32)
  return _access(filename, 0) == 0;
#else
  return access(filename, 0) == 0;
#endif
}
|
|
|
|
|
2018-08-24 21:07:48 +08:00
|
|
|
class TatweelTest : public ::testing::Test {
|
2021-03-13 05:06:34 +08:00
|
|
|
protected:
|
2019-06-24 18:52:06 +08:00
|
|
|
void SetUp() override {
|
|
|
|
static std::locale system_locale("");
|
|
|
|
std::locale::global(system_locale);
|
2019-05-17 00:12:06 +08:00
|
|
|
}
|
|
|
|
|
2018-08-24 21:07:48 +08:00
|
|
|
TatweelTest() {
|
2019-06-24 18:52:06 +08:00
|
|
|
std::string filename = TestDataNameToPath("ara.wordlist");
|
|
|
|
if (file_exists(filename.c_str())) {
|
2021-08-08 18:10:20 +08:00
|
|
|
std::string wordlist("\u0640");
|
2019-06-24 18:52:06 +08:00
|
|
|
CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
|
|
|
|
// Put all the unicodes in the unicharset_.
|
|
|
|
UnicodeText text;
|
|
|
|
text.PointToUTF8(wordlist.data(), wordlist.size());
|
|
|
|
int num_tatweel = 0;
|
|
|
|
for (auto it = text.begin(); it != text.end(); ++it) {
|
|
|
|
std::string utf8 = it.get_utf8_string();
|
2021-08-08 18:10:20 +08:00
|
|
|
if (utf8.find("\u0640") != std::string::npos)
|
2021-03-13 05:06:34 +08:00
|
|
|
++num_tatweel;
|
2019-06-24 18:52:06 +08:00
|
|
|
unicharset_.unichar_insert(utf8.c_str());
|
|
|
|
}
|
|
|
|
LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
|
|
|
|
EXPECT_GT(num_tatweel, 0);
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
std::string TestDataNameToPath(const std::string &name) {
|
2019-06-24 18:52:06 +08:00
|
|
|
return file::JoinPath(TESTDATA_DIR, name);
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
UNICHARSET unicharset_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Verifies that the unicharset ignores the Tatweel character: no unichar
// stored in the fixture's unicharset_ may contain U+0640.
TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
  // UTF-8 encoded Tatweel, viewed as plain char data for strstr.
  const char *const tatweel = reinterpret_cast<const char *>(u8"\u0640");
  for (int id = 0; id < unicharset_.size(); ++id) {
    const char *unichar = unicharset_.id_to_unichar(id);
    EXPECT_EQ(strstr(unichar, tatweel), nullptr);
  }
}
|
|
|
|
|
|
|
|
// Verifies that the dictionary ignores the Tatweel character: the word list
// is added to a trie, and check_for_words on the same file must then
// return 0. The test is skipped when ara.wordlist is not available.
TEST_F(TatweelTest, DictIgnoresTatweel) {
  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0);
  const std::string wordlist_path = TestDataNameToPath("ara.wordlist");
  const char *path = wordlist_path.c_str();
  if (file_exists(path)) {
    EXPECT_TRUE(
        trie.read_and_add_word_list(path, unicharset_, tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));
    EXPECT_EQ(0, trie.check_for_words(path, unicharset_, false));
  } else {
    LOG(INFO) << "Skip test because of missing " << wordlist_path;
    GTEST_SKIP();
  }
}
|
|
|
|
|
|
|
|
// Verifies that loading an existing unicharset keeps any Tatweel it already
// contains (backwards compatibility): after loading ara.unicharset, exactly
// 4 unichars are expected to contain U+0640.
// The test is skipped when ara.unicharset is not available.
TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
  const std::string unicharset_path = TestDataNameToPath("ara.unicharset");
  if (file_exists(unicharset_path.c_str())) {
    EXPECT_TRUE(unicharset_.load_from_file(unicharset_path.c_str()));
    // UTF-8 encoded Tatweel, viewed as plain char data for strstr.
    const char *const tatweel = reinterpret_cast<const char *>(u8"\u0640");
    int tatweel_count = 0;
    for (int id = 0; id < unicharset_.size(); ++id) {
      const char *unichar = unicharset_.id_to_unichar(id);
      tatweel_count += (strstr(unichar, tatweel) != nullptr) ? 1 : 0;
    }
    LOG(INFO) << "Num tatweels in unicharset=" << tatweel_count;
    EXPECT_EQ(tatweel_count, 4);
  } else {
    LOG(INFO) << "Skip test because of missing " << unicharset_path;
    GTEST_SKIP();
  }
}
|
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
} // namespace tesseract
|