From 07ca24aeaffc44140094b3b93cb51d22e39f4dba Mon Sep 17 00:00:00 2001
From: "theraysmith@gmail.com"
 <theraysmith@gmail.com@d0cd1f9f-072b-0410-8dd7-cf729c803f20>
Date: Mon, 3 Feb 2014 19:18:23 +0000
Subject: [PATCH] Removed upper limit on trie size, fixing issue 1020.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20
---
 cube/word_list_lang_model.cpp |  1 -
 cube/word_list_lang_model.h   |  1 -
 dict/dict.cpp                 | 12 ++++--------
 dict/dict.h                   |  3 ---
 dict/trie.cpp                 |  1 -
 dict/trie.h                   |  4 +---
 training/wordlist2dawg.cpp    |  5 +----
 7 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/cube/word_list_lang_model.cpp b/cube/word_list_lang_model.cpp
index 8537d256a..67a6a5a98 100644
--- a/cube/word_list_lang_model.cpp
+++ b/cube/word_list_lang_model.cpp
@@ -53,7 +53,6 @@ bool WordListLangModel::Init() {
   // The last parameter to the Trie constructor (the debug level) is set to
   // false for now, until Cube has a way to express its preferred debug level.
   dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
-                   WordListLangModel::kMaxDawgEdges,
                    cntxt_->CharacterSet()->ClassCount(), false);
   if (dawg_ == NULL) {
     return false;
diff --git a/cube/word_list_lang_model.h b/cube/word_list_lang_model.h
index a975e3c5c..099d62949 100644
--- a/cube/word_list_lang_model.h
+++ b/cube/word_list_lang_model.h
@@ -69,7 +69,6 @@ class WordListLangModel : public LangModel {
  private:
   // constants needed to configure the language model
   static const int kMaxEdge = 512;
-  static const int kMaxDawgEdges = 20000;
 
   CubeRecoContext *cntxt_;
   Trie *dawg_;
diff --git a/dict/dict.cpp b/dict/dict.cpp
index 0d652514d..6165eb062 100644
--- a/dict/dict.cpp
+++ b/dict/dict.cpp
@@ -239,8 +239,7 @@ void Dict::Load(DawgCache *dawg_cache) {
 
   if (((STRING &)user_words_suffix).length() > 0) {
     Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
-                              kMaxUserDawgEdges, getUnicharset().size(),
-                              dawg_debug_level);
+                              getUnicharset().size(), dawg_debug_level);
     name = getCCUtil()->language_data_path_prefix;
     name += user_words_suffix;
     if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
@@ -254,8 +253,7 @@ void Dict::Load(DawgCache *dawg_cache) {
 
   if (((STRING &)user_patterns_suffix).length() > 0) {
     Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
-                              kMaxUserDawgEdges, getUnicharset().size(),
-                              dawg_debug_level);
+                              getUnicharset().size(), dawg_debug_level);
     trie_ptr->initialize_patterns(&(getUnicharset()));
     name = getCCUtil()->language_data_path_prefix;
     name += user_patterns_suffix;
@@ -268,14 +266,12 @@ void Dict::Load(DawgCache *dawg_cache) {
   }
 
   document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
-                             kMaxDocDawgEdges, getUnicharset().size(),
-                             dawg_debug_level);
+                             getUnicharset().size(), dawg_debug_level);
   dawgs_ += document_words_;
 
   // This dawg is temporary and should not be searched by letter_is_ok.
   pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
-                            kMaxDocDawgEdges, getUnicharset().size(),
-                            dawg_debug_level);
+                            getUnicharset().size(), dawg_debug_level);
 
   // Construct a list of corresponding successors for each dawg. Each entry i
   // in the successors_ vector is a vector of integers that represent the
diff --git a/dict/dict.h b/dict/dict.h
index 2e53ad56a..e95a3e5d7 100644
--- a/dict/dict.h
+++ b/dict/dict.h
@@ -61,9 +61,6 @@ static const char kHyphenSymbol[] = "-";
 static const char kSlashSymbol[] = "/";
 static const char kQuestionSymbol[] = "?";
 static const char kApostropheSymbol[] = "'";
-static const int kMaxNumDawgEdgees = 2000000;
-static const int kMaxDocDawgEdges = 250000;
-static const int kMaxUserDawgEdges = 50000;
 static const float kSimCertaintyScale = -10.0;   // similarity matcher scaling
 static const float kSimCertaintyOffset = -10.0;  // similarity matcher offset
 static const float kSimilarityFloor = 100.0;  // worst E*L product to stop on
diff --git a/dict/trie.cpp b/dict/trie.cpp
index 189c8c9b7..e2fd49085 100644
--- a/dict/trie.cpp
+++ b/dict/trie.cpp
@@ -125,7 +125,6 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
                             int direction, bool word_end,
                             UNICHAR_ID unichar_id) {
-  if (num_edges_ == max_num_edges_) return false;
   EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
     &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
   int search_index;
diff --git a/dict/trie.h b/dict/trie.h
index ba60b4e25..4a89debfb 100644
--- a/dict/trie.h
+++ b/dict/trie.h
@@ -87,10 +87,9 @@ class Trie : public Dawg {
   // contain more edges than max_num_edges, all the edges are cleared
   // so that new inserts can proceed).
   Trie(DawgType type, const STRING &lang, PermuterType perm,
-       uinT64 max_num_edges, int unicharset_size, int debug_level) {
+       int unicharset_size, int debug_level) {
     init(type, lang, perm, unicharset_size, debug_level);
     num_edges_ = 0;
-    max_num_edges_ = max_num_edges;
     deref_node_index_mask_ = ~letter_mask_;
     new_dawg_node();  // need to allocate node 0
     initialized_patterns_ = false;
@@ -415,7 +414,6 @@ class Trie : public Dawg {
   // Member variables
   TRIE_NODES nodes_;              // vector of nodes in the Trie
   uinT64 num_edges_;              // sum of all edges (forward and backward)
-  uinT64 max_num_edges_;          // maximum number of edges allowed
   uinT64 deref_direction_mask_;   // mask for EDGE_REF to extract direction
   uinT64 deref_node_index_mask_;  // mask for EDGE_REF to extract node index
   // Freelist of edges in the root backwards node that were previously zeroed.
diff --git a/training/wordlist2dawg.cpp b/training/wordlist2dawg.cpp
index 17fef5a56..8812df8ec 100644
--- a/training/wordlist2dawg.cpp
+++ b/training/wordlist2dawg.cpp
@@ -32,8 +32,6 @@
 #include "trie.h"
 #include "unicharset.h"
 
-static const int kMaxNumEdges =  30000000;
-
 int main(int argc, char** argv) {
   if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
       (argc == 6 && strcmp(argv[1], "-r") == 0))) {
@@ -69,8 +67,7 @@ int main(int argc, char** argv) {
     tesseract::Trie trie(
         // the first 3 arguments are not used in this case
         tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
-        kMaxNumEdges, unicharset.size(),
-        classify->getDict().dawg_debug_level);
+        unicharset.size(), classify->getDict().dawg_debug_level);
     tprintf("Reading word list from '%s'\n", wordlist_filename);
     if (!trie.read_and_add_word_list(wordlist_filename, unicharset,
                                      reverse_policy)) {