2019-01-21 18:55:36 +08:00
|
|
|
// (C) Copyright 2017, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2020-12-30 08:33:38 +08:00
|
|
|
#include "include_gunit.h"
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2019-01-20 21:54:34 +08:00
|
|
|
#include "ratngs.h"
|
|
|
|
#include "trie.h"
|
2021-03-13 05:06:34 +08:00
|
|
|
#include "unicharset.h"
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <cstdlib> // for system
|
|
|
|
#include <fstream> // for ifstream
|
2020-12-30 08:33:38 +08:00
|
|
|
#include <set>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2020-12-30 07:38:11 +08:00
|
|
|
// Names of the two command-line tools exercised by the round-trip test.
// They are only defined here when SW_TESTING is not defined.
// NOTE(review): presumably an SW_TESTING build supplies its own definitions
// of these two macros - confirm against that build configuration.
#ifndef SW_TESTING
|
2021-03-13 05:06:34 +08:00
|
|
|
# define wordlist2dawg_prog "wordlist2dawg"
|
|
|
|
# define dawg2wordlist_prog "dawg2wordlist"
|
2020-12-30 07:38:11 +08:00
|
|
|
#endif // SW_TESTING
|
|
|
|
|
2020-12-27 17:41:48 +08:00
|
|
|
namespace tesseract {
|
2018-08-24 21:07:48 +08:00
|
|
|
|
|
|
|
// Test some basic functionality dealing with Dawgs (compressed dictionaries,
|
|
|
|
// aka Directed Acyclic Word Graphs).
|
|
|
|
class DawgTest : public testing::Test {
|
2021-03-13 05:06:34 +08:00
|
|
|
protected:
|
2021-03-22 15:26:05 +08:00
|
|
|
void SetUp() override {
|
2019-05-17 00:12:06 +08:00
|
|
|
std::locale::global(std::locale(""));
|
2020-12-31 01:17:58 +08:00
|
|
|
file::MakeTmpdir();
|
2019-05-17 00:12:06 +08:00
|
|
|
}
|
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
void LoadWordlist(const std::string &filename, std::set<std::string> *words) const {
|
2019-01-21 18:47:10 +08:00
|
|
|
std::ifstream file(filename);
|
|
|
|
if (file.is_open()) {
|
|
|
|
std::string line;
|
|
|
|
while (getline(file, line)) {
|
|
|
|
// Remove trailing line terminators from line.
|
|
|
|
while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
|
|
|
|
line.resize(line.size() - 1);
|
|
|
|
}
|
|
|
|
// Add line to set.
|
|
|
|
words->insert(line.c_str());
|
|
|
|
}
|
|
|
|
file.close();
|
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
2021-03-13 05:06:34 +08:00
|
|
|
std::string TessBinaryPath(const std::string &name) const {
|
2020-12-21 21:58:25 +08:00
|
|
|
return file::JoinPath(TESSBIN_DIR, name);
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
2021-03-13 05:06:34 +08:00
|
|
|
std::string OutputNameToPath(const std::string &name) const {
|
2018-08-24 21:07:48 +08:00
|
|
|
return file::JoinPath(FLAGS_test_tmpdir, name);
|
|
|
|
}
|
2021-03-13 05:06:34 +08:00
|
|
|
int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2,
|
|
|
|
const std::string &arg3) const {
|
|
|
|
std::string cmdline = TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
|
2019-01-21 18:47:10 +08:00
|
|
|
return system(cmdline.c_str());
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
// Test that we are able to convert a wordlist file (one "word" per line) to
|
|
|
|
// a dawg (a compressed format) and then extract the original wordlist back
|
|
|
|
// out using the tools "wordlist2dawg" and "dawg2wordlist."
|
2021-03-13 05:06:34 +08:00
|
|
|
void TestDawgRoundTrip(const std::string &unicharset_filename,
|
|
|
|
const std::string &wordlist_filename) const {
|
2019-01-20 21:54:34 +08:00
|
|
|
std::set<std::string> orig_words, roundtrip_words;
|
2019-01-22 01:00:08 +08:00
|
|
|
std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);
|
|
|
|
std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);
|
2019-01-20 21:54:34 +08:00
|
|
|
std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
|
|
|
|
std::string output_wordlist = OutputNameToPath(wordlist_filename);
|
2018-08-24 21:07:48 +08:00
|
|
|
LoadWordlist(orig_wordlist, &orig_words);
|
2021-03-13 05:06:34 +08:00
|
|
|
EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
|
|
|
|
EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0);
|
2018-08-24 21:07:48 +08:00
|
|
|
LoadWordlist(output_wordlist, &roundtrip_words);
|
|
|
|
EXPECT_EQ(orig_words, roundtrip_words);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Round-trips the English test wordlist through wordlist2dawg and
// dawg2wordlist using the English unicharset.
TEST_F(DawgTest, TestDawgConversion) {
  const std::string unicharset_name = "eng.unicharset";
  const std::string wordlist_name = "eng.wordlist.clean.freq";
  TestDawgRoundTrip(unicharset_name, wordlist_name);
}
|
|
|
|
|
|
|
|
// Checks prefix and whole-word lookups on a Trie that contains the single
// two-character word " '" (space followed by apostrophe).
TEST_F(DawgTest, TestMatching) {
  UNICHARSET unicharset;
  // Every later step depends on the character set, so stop right away if it
  // cannot be loaded instead of failing later in a less obvious way.
  const std::string unicharset_path = file::JoinPath(TESTING_DIR, "eng.unicharset");
  ASSERT_TRUE(unicharset.load_from_file(unicharset_path.c_str()))
      << "Failed to load unicharset " << unicharset_path;
  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, unicharset.size(), 0);
  WERD_CHOICE space_apos(" '", unicharset);
  // The lookups below are only meaningful if the word was actually stored.
  EXPECT_TRUE(trie.add_word_to_dawg(space_apos));

  WERD_CHOICE space(" ", unicharset);

  // partial match ok - then good!
  EXPECT_TRUE(trie.prefix_in_dawg(space, false));
  // require complete match - not present.
  EXPECT_FALSE(trie.word_in_dawg(space));
  EXPECT_FALSE(trie.prefix_in_dawg(space, true));

  // partial or complete match ok for full word:
  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
  EXPECT_TRUE(trie.word_in_dawg(space_apos));
  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
}
|
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
} // namespace tesseract
|