Removed upper limit on trie size, fixing issue 1020.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 18:02:40 +08:00 · 2014-02-03 19:18:23 +00:00 · 2014-02-03 19:18:23 +00:00 · 07ca24aeaf
commit 07ca24aeaf
parent df80e9dc59
7 changed files with 6 additions and 21 deletions
--- a/cube/word_list_lang_model.cpp
+++ b/cube/word_list_lang_model.cpp
@@ -53,7 +53,6 @@ bool WordListLangModel::Init() {
  // The last parameter to the Trie constructor (the debug level) is set to
  // false for now, until Cube has a way to express its preferred debug level.
  dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
-                   WordListLangModel::kMaxDawgEdges,
                   cntxt_->CharacterSet()->ClassCount(), false);
  if (dawg_ == NULL) {
    return false;
--- a/cube/word_list_lang_model.h
+++ b/cube/word_list_lang_model.h
@@ -69,7 +69,6 @@ class WordListLangModel : public LangModel {
 private:
  // constants needed to configure the language model
  static const int kMaxEdge = 512;
-  static const int kMaxDawgEdges = 20000;

  CubeRecoContext *cntxt_;
  Trie *dawg_;
--- a/dict/dict.cpp
+++ b/dict/dict.cpp
@@ -239,8 +239,7 @@ void Dict::Load(DawgCache *dawg_cache) {

  if (((STRING &)user_words_suffix).length() > 0) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
-                              kMaxUserDawgEdges, getUnicharset().size(),
-                              dawg_debug_level);
+                              getUnicharset().size(), dawg_debug_level);
    name = getCCUtil()->language_data_path_prefix;
    name += user_words_suffix;
    if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
@@ -254,8 +253,7 @@ void Dict::Load(DawgCache *dawg_cache) {

  if (((STRING &)user_patterns_suffix).length() > 0) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
-                              kMaxUserDawgEdges, getUnicharset().size(),
-                              dawg_debug_level);
+                              getUnicharset().size(), dawg_debug_level);
    trie_ptr->initialize_patterns(&(getUnicharset()));
    name = getCCUtil()->language_data_path_prefix;
    name += user_patterns_suffix;
@@ -268,14 +266,12 @@ void Dict::Load(DawgCache *dawg_cache) {
  }

  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
-                             kMaxDocDawgEdges, getUnicharset().size(),
-                             dawg_debug_level);
+                             getUnicharset().size(), dawg_debug_level);
  dawgs_ += document_words_;

  // This dawg is temporary and should not be searched by letter_is_ok.
  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
-                            kMaxDocDawgEdges, getUnicharset().size(),
-                            dawg_debug_level);
+                            getUnicharset().size(), dawg_debug_level);

  // Construct a list of corresponding successors for each dawg. Each entry i
  // in the successors_ vector is a vector of integers that represent the
--- a/dict/dict.h
+++ b/dict/dict.h
@@ -61,9 +61,6 @@ static const char kHyphenSymbol[] = "-";
 static const char kSlashSymbol[] = "/";
 static const char kQuestionSymbol[] = "?";
 static const char kApostropheSymbol[] = "'";
-static const int kMaxNumDawgEdgees = 2000000;
-static const int kMaxDocDawgEdges = 250000;
-static const int kMaxUserDawgEdges = 50000;
 static const float kSimCertaintyScale = -10.0;   // similarity matcher scaling
 static const float kSimCertaintyOffset = -10.0;  // similarity matcher offset
 static const float kSimilarityFloor = 100.0;  // worst E*L product to stop on
--- a/dict/trie.cpp
+++ b/dict/trie.cpp
@@ -125,7 +125,6 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
                            int direction, bool word_end,
                            UNICHAR_ID unichar_id) {
-  if (num_edges_ == max_num_edges_) return false;
  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
    &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
  int search_index;
--- a/dict/trie.h
+++ b/dict/trie.h
@@ -87,10 +87,9 @@ class Trie : public Dawg {
  // contain more edges than max_num_edges, all the edges are cleared
  // so that new inserts can proceed).
  Trie(DawgType type, const STRING &lang, PermuterType perm,
-       uinT64 max_num_edges, int unicharset_size, int debug_level) {
+       int unicharset_size, int debug_level) {
    init(type, lang, perm, unicharset_size, debug_level);
    num_edges_ = 0;
-    max_num_edges_ = max_num_edges;
    deref_node_index_mask_ = ~letter_mask_;
    new_dawg_node();  // need to allocate node 0
    initialized_patterns_ = false;
@@ -415,7 +414,6 @@ class Trie : public Dawg {
  // Member variables
  TRIE_NODES nodes_;              // vector of nodes in the Trie
  uinT64 num_edges_;              // sum of all edges (forward and backward)
-  uinT64 max_num_edges_;          // maximum number of edges allowed
  uinT64 deref_direction_mask_;   // mask for EDGE_REF to extract direction
  uinT64 deref_node_index_mask_;  // mask for EDGE_REF to extract node index
  // Freelist of edges in the root backwards node that were previously zeroed.
--- a/training/wordlist2dawg.cpp
+++ b/training/wordlist2dawg.cpp
@@ -32,8 +32,6 @@
 #include "trie.h"
 #include "unicharset.h"

-static const int kMaxNumEdges =  30000000;
-
 int main(int argc, char** argv) {
  if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
      (argc == 6 && strcmp(argv[1], "-r") == 0))) {
@@ -69,8 +67,7 @@ int main(int argc, char** argv) {
    tesseract::Trie trie(
        // the first 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
-        kMaxNumEdges, unicharset.size(),
-        classify->getDict().dawg_debug_level);
+        unicharset.size(), classify->getDict().dawg_debug_level);
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    if (!trie.read_and_add_word_list(wordlist_filename, unicharset,
                                     reverse_policy)) {