From 07ca24aeaffc44140094b3b93cb51d22e39f4dba Mon Sep 17 00:00:00 2001 From: "theraysmith@gmail.com" Date: Mon, 3 Feb 2014 19:18:23 +0000 Subject: [PATCH] Removed upper limit on trie size, fixing issue 1020. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- cube/word_list_lang_model.cpp | 1 - cube/word_list_lang_model.h | 1 - dict/dict.cpp | 12 ++++-------- dict/dict.h | 3 --- dict/trie.cpp | 1 - dict/trie.h | 4 +--- training/wordlist2dawg.cpp | 5 +---- 7 files changed, 6 insertions(+), 21 deletions(-) diff --git a/cube/word_list_lang_model.cpp b/cube/word_list_lang_model.cpp index 8537d256a..67a6a5a98 100644 --- a/cube/word_list_lang_model.cpp +++ b/cube/word_list_lang_model.cpp @@ -53,7 +53,6 @@ bool WordListLangModel::Init() { // The last parameter to the Trie constructor (the debug level) is set to // false for now, until Cube has a way to express its preferred debug level. dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM, - WordListLangModel::kMaxDawgEdges, cntxt_->CharacterSet()->ClassCount(), false); if (dawg_ == NULL) { return false; diff --git a/cube/word_list_lang_model.h b/cube/word_list_lang_model.h index a975e3c5c..099d62949 100644 --- a/cube/word_list_lang_model.h +++ b/cube/word_list_lang_model.h @@ -69,7 +69,6 @@ class WordListLangModel : public LangModel { private: // constants needed to configure the language model static const int kMaxEdge = 512; - static const int kMaxDawgEdges = 20000; CubeRecoContext *cntxt_; Trie *dawg_; diff --git a/dict/dict.cpp b/dict/dict.cpp index 0d652514d..6165eb062 100644 --- a/dict/dict.cpp +++ b/dict/dict.cpp @@ -239,8 +239,7 @@ void Dict::Load(DawgCache *dawg_cache) { if (((STRING &)user_words_suffix).length() > 0) { Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, - kMaxUserDawgEdges, getUnicharset().size(), - dawg_debug_level); + getUnicharset().size(), dawg_debug_level); name = getCCUtil()->language_data_path_prefix; name += user_words_suffix; if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), @@ -254,8 +253,7 @@ void Dict::Load(DawgCache *dawg_cache) { if (((STRING &)user_patterns_suffix).length() > 0) { Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, - kMaxUserDawgEdges, getUnicharset().size(), - dawg_debug_level); + getUnicharset().size(), dawg_debug_level); trie_ptr->initialize_patterns(&(getUnicharset())); name = getCCUtil()->language_data_path_prefix; name += user_patterns_suffix; @@ -268,14 +266,12 @@ void Dict::Load(DawgCache *dawg_cache) { } document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, - kMaxDocDawgEdges, getUnicharset().size(), - dawg_debug_level); + getUnicharset().size(), dawg_debug_level); dawgs_ += document_words_; // This dawg is temporary and should not be searched by letter_is_ok. pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, - kMaxDocDawgEdges, getUnicharset().size(), - dawg_debug_level); + getUnicharset().size(), dawg_debug_level); // Construct a list of corresponding successors for each dawg. Each entry i // in the successors_ vector is a vector of integers that represent the diff --git a/dict/dict.h b/dict/dict.h index 2e53ad56a..e95a3e5d7 100644 --- a/dict/dict.h +++ b/dict/dict.h @@ -61,9 +61,6 @@ static const char kHyphenSymbol[] = "-"; static const char kSlashSymbol[] = "/"; static const char kQuestionSymbol[] = "?"; static const char kApostropheSymbol[] = "'"; -static const int kMaxNumDawgEdgees = 2000000; -static const int kMaxDocDawgEdges = 250000; -static const int kMaxUserDawgEdges = 50000; static const float kSimCertaintyScale = -10.0; // similarity matcher scaling static const float kSimCertaintyOffset = -10.0; // similarity matcher offset static const float kSimilarityFloor = 100.0; // worst E*L product to stop on diff --git a/dict/trie.cpp b/dict/trie.cpp index 189c8c9b7..e2fd49085 100644 --- a/dict/trie.cpp +++ b/dict/trie.cpp @@ -125,7 +125,6 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, int direction, bool word_end, UNICHAR_ID unichar_id) { - if (num_edges_ == max_num_edges_) return false; EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges); int search_index; diff --git a/dict/trie.h b/dict/trie.h index ba60b4e25..4a89debfb 100644 --- a/dict/trie.h +++ b/dict/trie.h @@ -87,10 +87,9 @@ class Trie : public Dawg { // contain more edges than max_num_edges, all the edges are cleared // so that new inserts can proceed). Trie(DawgType type, const STRING &lang, PermuterType perm, - uinT64 max_num_edges, int unicharset_size, int debug_level) { + int unicharset_size, int debug_level) { init(type, lang, perm, unicharset_size, debug_level); num_edges_ = 0; - max_num_edges_ = max_num_edges; deref_node_index_mask_ = ~letter_mask_; new_dawg_node(); // need to allocate node 0 initialized_patterns_ = false; @@ -415,7 +414,6 @@ class Trie : public Dawg { // Member variables TRIE_NODES nodes_; // vector of nodes in the Trie uinT64 num_edges_; // sum of all edges (forward and backward) - uinT64 max_num_edges_; // maximum number of edges allowed uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index // Freelist of edges in the root backwards node that were previously zeroed. diff --git a/training/wordlist2dawg.cpp b/training/wordlist2dawg.cpp index 17fef5a56..8812df8ec 100644 --- a/training/wordlist2dawg.cpp +++ b/training/wordlist2dawg.cpp @@ -32,8 +32,6 @@ #include "trie.h" #include "unicharset.h" -static const int kMaxNumEdges = 30000000; - int main(int argc, char** argv) { if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) || (argc == 6 && strcmp(argv[1], "-r") == 0))) { @@ -69,8 +67,7 @@ int main(int argc, char** argv) { tesseract::Trie trie( // the first 3 arguments are not used in this case tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, - kMaxNumEdges, unicharset.size(), - classify->getDict().dawg_debug_level); + unicharset.size(), classify->getDict().dawg_debug_level); tprintf("Reading word list from '%s'\n", wordlist_filename); if (!trie.read_and_add_word_list(wordlist_filename, unicharset, reverse_policy)) {