tesseract/unittest/dawg_test.cc


#include <cstring>
#include <set>
#include <string>
#include <vector>

#include "util/process/subprocess.h"

#include "tesseract/ccstruct/ratngs.h"
#include "tesseract/ccutil/unicharset.h"
#include "tesseract/dict/trie.h"

namespace {

// Strips every trailing '\n' and '\r' from the NUL-terminated string |line|,
// in place, by overwriting them with NUL bytes.
//
// |line| must be a valid, writable, NUL-terminated buffer. Empty strings and
// strings consisting solely of line terminators are handled: the scan works
// on a length index, so no pointer is ever formed before the start of the
// buffer (the previous pointer-based loop did so, which is undefined
// behaviour).
void RemoveTrailingLineTerminators(char* line) {
  size_t length = strlen(line);
  while (length > 0 &&
         (line[length - 1] == '\n' || line[length - 1] == '\r')) {
    line[--length] = '\0';
  }
}

void AddLineToSet(std::set<string>* words, char* line) {
  RemoveTrailingLineTerminators(line);
  words->insert(line);
}

// Test fixture for basic functionality dealing with Dawgs (compressed
// dictionaries, aka Directed Acyclic Word Graphs). It provides helpers for
// locating test data, running the command-line conversion tools and
// comparing wordlists.
class DawgTest : public testing::Test {
 protected:
  // Populates |words| from the file |filename| by registering AddLineToSet as
  // the FileLineReader line callback, so each delivered line is stored with
  // its trailing line terminators removed.
  void LoadWordlist(const string& filename, std::set<string>* words) const {
    FileLineReader::Options options;
    // NOTE(review): presumably a comment char of 0 disables comment
    // stripping so that no wordlist entry is dropped - confirm against
    // FileLineReader.
    options.set_comment_char(0);
    FileLineReader flr(filename.c_str(), options);
    flr.set_line_callback(NewPermanentCallback(AddLineToSet, words));
    flr.Reload();
  }
  // Returns the path of the test data file |name|, i.e. "testdata/<name>"
  // joined onto FLAGS_test_srcdir.
  string TestDataNameToPath(const string& name) const {
    return file::JoinPath(FLAGS_test_srcdir, "testdata/" + name);
  }
  // Returns the path of the tool |binary_name| below FLAGS_test_srcdir.
  // NOTE(review): the second JoinPath argument was missing (the statement
  // was truncated and did not compile); "tesseract/" is restored on the
  // assumption that the binaries live in that subdirectory of the test
  // source tree - confirm against the build layout.
  string TessBinaryPath(const string& binary_name) const {
    return file::JoinPath(FLAGS_test_srcdir, "tesseract/" + binary_name);
  }
  // Returns the path of the output file |name| inside FLAGS_test_tmpdir.
  string OutputNameToPath(const string& name) const {
    return file::JoinPath(FLAGS_test_tmpdir, name);
  }
  // Runs the tool |program| (located via TessBinaryPath) with the three
  // arguments |arg1|, |arg2| and |arg3|, waits for it to finish and returns
  // its exit code.
  int RunCommand(const string& program, const string& arg1, const string& arg2,
                 const string& arg3) const {
    SubProcess p;
    // argv[0] is the bare program name, followed by the three arguments.
    std::vector<string> argv;
    argv.push_back(program);
    argv.push_back(arg1);
    argv.push_back(arg2);
    argv.push_back(arg3);
    p.SetProgram(TessBinaryPath(program), argv);
    p.Start();
    p.Wait();
    return p.exit_code();
  }
  // Test that we are able to convert a wordlist file (one "word" per line) to
  // a dawg (a compressed format) and then extract the original wordlist back
  // out using the tools "wordlist2dawg" and "dawg2wordlist."
  //
  // |unicharset_filename| and |wordlist_filename| are names of files in the
  // test data directory. The generated dawg and the extracted wordlist are
  // written to the temporary output directory.
  void TestDawgRoundTrip(const string& unicharset_filename,
                         const string& wordlist_filename) const {
    std::set<string> orig_words, roundtrip_words;
    string unicharset = TestDataNameToPath(unicharset_filename);
    string orig_wordlist = TestDataNameToPath(wordlist_filename);
    string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
    string output_wordlist = OutputNameToPath(wordlist_filename);
    LoadWordlist(orig_wordlist, &orig_words);
    // Both tools must exit with status 0.
    EXPECT_EQ(
        RunCommand("wordlist2dawg", orig_wordlist, output_dawg, unicharset), 0);
    EXPECT_EQ(
        RunCommand("dawg2wordlist", unicharset, output_dawg, output_wordlist),
        0);
    // The set of words read back must equal the set of original words.
    LoadWordlist(output_wordlist, &roundtrip_words);
    EXPECT_EQ(orig_words, roundtrip_words);
  }
};

// Round-trips the English frequency wordlist through the dawg conversion
// tools, using the English unicharset.
TEST_F(DawgTest, TestDawgConversion) {
  const string unicharset_name = "eng.unicharset";
  const string wordlist_name = "eng.wordlist.clean.freq";
  TestDawgRoundTrip(unicharset_name, wordlist_name);
}

// Checks prefix and whole-word lookups on a Trie that contains exactly one
// word: a space followed by an apostrophe.
TEST_F(DawgTest, TestMatching) {
  UNICHARSET unicharset;
  // Stop right here if the unicharset cannot be loaded; otherwise the
  // expectations below would fail for a reason unrelated to the Trie.
  ASSERT_TRUE(
      unicharset.load_from_file(TestDataNameToPath("eng.unicharset").c_str()));
  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM,
                       unicharset.size(), 0);
  WERD_CHOICE space_apos(" '", unicharset);
  trie.add_word_to_dawg(space_apos);

  // A proper prefix of the only word stored in the trie.
  WERD_CHOICE space(" ", unicharset);

  // partial match ok - then good!
  EXPECT_TRUE(trie.prefix_in_dawg(space, false));
  // require complete match - not present.
  EXPECT_FALSE(trie.word_in_dawg(space));
  EXPECT_FALSE(trie.prefix_in_dawg(space, true));

  // partial or complete match ok for full word:
  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
  EXPECT_TRUE(trie.word_in_dawg(space_apos));
  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
}

}  // namespace
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00
			`#include <set>`
			`#include <string>`
			`#include <vector>`

			`#include "util/process/subprocess.h"`

			`#include "tesseract/ccstruct/ratngs.h"`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`#include "tesseract/ccutil/unicharset.h"`
			`#include "tesseract/dict/trie.h"`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00
			`namespace {`

unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`void RemoveTrailingLineTerminators(char* line) {`
			`char* end = line + strlen(line) - 1;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`while (end >= line && ('\n' == *end \|\| '\r' == *end)) {`
			`*end-- = 0;`
			`}`
			`}`

unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`void AddLineToSet(std::set<string>* words, char* line) {`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`RemoveTrailingLineTerminators(line);`
			`words->insert(line);`
			`}`

			`// Test some basic functionality dealing with Dawgs (compressed dictionaries,`
			`// aka Directed Acyclic Word Graphs).`
			`class DawgTest : public testing::Test {`
			`protected:`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`void LoadWordlist(const string& filename, std::set<string>* words) const {`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`FileLineReader::Options options;`
			`options.set_comment_char(0);`
			`FileLineReader flr(filename.c_str(), options);`
			`flr.set_line_callback(NewPermanentCallback(AddLineToSet, words));`
			`flr.Reload();`
			`}`
			`string TestDataNameToPath(const string& name) const {`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`return file::JoinPath(FLAGS_test_srcdir, "testdata/" + name);`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`}`
			`string TessBinaryPath(const string& binary_name) const {`
			`return file::JoinPath(FLAGS_test_srcdir,`
			`}`
			`string OutputNameToPath(const string& name) const {`
			`return file::JoinPath(FLAGS_test_tmpdir, name);`
			`}`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`int RunCommand(const string& program, const string& arg1, const string& arg2,`
			`const string& arg3) const {`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`SubProcess p;`
			`std::vector<string> argv;`
			`argv.push_back(program);`
			`argv.push_back(arg1);`
			`argv.push_back(arg2);`
			`argv.push_back(arg3);`
			`p.SetProgram(TessBinaryPath(program), argv);`
			`p.Start();`
			`p.Wait();`
			`return p.exit_code();`
			`}`
			`// Test that we are able to convert a wordlist file (one "word" per line) to`
			`// a dawg (a compressed format) and then extract the original wordlist back`
			`// out using the tools "wordlist2dawg" and "dawg2wordlist."`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`void TestDawgRoundTrip(const string& unicharset_filename,`
			`const string& wordlist_filename) const {`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`std::set<string> orig_words, roundtrip_words;`
			`string unicharset = TestDataNameToPath(unicharset_filename);`
			`string orig_wordlist = TestDataNameToPath(wordlist_filename);`
			`string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");`
			`string output_wordlist = OutputNameToPath(wordlist_filename);`
			`LoadWordlist(orig_wordlist, &orig_words);`
			`EXPECT_EQ(`
unittest: Format code It was formatted with clang-format-7 -i unittest/.{c,h}. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-09-29 15:19:13 +08:00			`RunCommand("wordlist2dawg", orig_wordlist, output_dawg, unicharset), 0);`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_EQ(`
			`RunCommand("dawg2wordlist", unicharset, output_dawg, output_wordlist),`
			`0);`
			`LoadWordlist(output_wordlist, &roundtrip_words);`
			`EXPECT_EQ(orig_words, roundtrip_words);`
			`}`
			`};`

			`TEST_F(DawgTest, TestDawgConversion) {`
			`TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq");`
			`}`

			`TEST_F(DawgTest, TestMatching) {`
			`UNICHARSET unicharset;`
			`unicharset.load_from_file(TestDataNameToPath("eng.unicharset").c_str());`
			`tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM,`
			`unicharset.size(), 0);`
			`WERD_CHOICE space_apos(" '", unicharset);`
			`trie.add_word_to_dawg(space_apos);`

			`WERD_CHOICE space(" ", unicharset);`

			`// partial match ok - then good!`
			`EXPECT_TRUE(trie.prefix_in_dawg(space, false));`
			`// require complete match - not present.`
			`EXPECT_FALSE(trie.word_in_dawg(space));`
			`EXPECT_FALSE(trie.prefix_in_dawg(space, true));`

			`// partial or complete match ok for full word:`
			`EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));`
			`EXPECT_TRUE(trie.word_in_dawg(space_apos));`
			`EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));`
			`}`

			`} // namespace`