Removed upper limit on trie size, fixing issue 1020.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2014-02-03 19:18:23 +00:00
parent df80e9dc59
commit 07ca24aeaf
7 changed files with 6 additions and 21 deletions

View File

@ -53,7 +53,6 @@ bool WordListLangModel::Init() {
// The last parameter to the Trie constructor (the debug level) is set to // The last parameter to the Trie constructor (the debug level) is set to
// false for now, until Cube has a way to express its preferred debug level. // false for now, until Cube has a way to express its preferred debug level.
dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM, dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
WordListLangModel::kMaxDawgEdges,
cntxt_->CharacterSet()->ClassCount(), false); cntxt_->CharacterSet()->ClassCount(), false);
if (dawg_ == NULL) { if (dawg_ == NULL) {
return false; return false;

View File

@ -69,7 +69,6 @@ class WordListLangModel : public LangModel {
private: private:
// constants needed to configure the language model // constants needed to configure the language model
static const int kMaxEdge = 512; static const int kMaxEdge = 512;
static const int kMaxDawgEdges = 20000;
CubeRecoContext *cntxt_; CubeRecoContext *cntxt_;
Trie *dawg_; Trie *dawg_;

View File

@ -239,8 +239,7 @@ void Dict::Load(DawgCache *dawg_cache) {
if (((STRING &)user_words_suffix).length() > 0) { if (((STRING &)user_words_suffix).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
kMaxUserDawgEdges, getUnicharset().size(), getUnicharset().size(), dawg_debug_level);
dawg_debug_level);
name = getCCUtil()->language_data_path_prefix; name = getCCUtil()->language_data_path_prefix;
name += user_words_suffix; name += user_words_suffix;
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
@ -254,8 +253,7 @@ void Dict::Load(DawgCache *dawg_cache) {
if (((STRING &)user_patterns_suffix).length() > 0) { if (((STRING &)user_patterns_suffix).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
kMaxUserDawgEdges, getUnicharset().size(), getUnicharset().size(), dawg_debug_level);
dawg_debug_level);
trie_ptr->initialize_patterns(&(getUnicharset())); trie_ptr->initialize_patterns(&(getUnicharset()));
name = getCCUtil()->language_data_path_prefix; name = getCCUtil()->language_data_path_prefix;
name += user_patterns_suffix; name += user_patterns_suffix;
@ -268,14 +266,12 @@ void Dict::Load(DawgCache *dawg_cache) {
} }
document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
kMaxDocDawgEdges, getUnicharset().size(), getUnicharset().size(), dawg_debug_level);
dawg_debug_level);
dawgs_ += document_words_; dawgs_ += document_words_;
// This dawg is temporary and should not be searched by letter_is_ok. // This dawg is temporary and should not be searched by letter_is_ok.
pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
kMaxDocDawgEdges, getUnicharset().size(), getUnicharset().size(), dawg_debug_level);
dawg_debug_level);
// Construct a list of corresponding successors for each dawg. Each entry i // Construct a list of corresponding successors for each dawg. Each entry i
// in the successors_ vector is a vector of integers that represent the // in the successors_ vector is a vector of integers that represent the

View File

@ -61,9 +61,6 @@ static const char kHyphenSymbol[] = "-";
static const char kSlashSymbol[] = "/"; static const char kSlashSymbol[] = "/";
static const char kQuestionSymbol[] = "?"; static const char kQuestionSymbol[] = "?";
static const char kApostropheSymbol[] = "'"; static const char kApostropheSymbol[] = "'";
static const int kMaxNumDawgEdgees = 2000000;
static const int kMaxDocDawgEdges = 250000;
static const int kMaxUserDawgEdges = 50000;
static const float kSimCertaintyScale = -10.0; // similarity matcher scaling static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
static const float kSimCertaintyOffset = -10.0; // similarity matcher offset static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
static const float kSimilarityFloor = 100.0; // worst E*L product to stop on static const float kSimilarityFloor = 100.0; // worst E*L product to stop on

View File

@ -125,7 +125,6 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
int direction, bool word_end, int direction, bool word_end,
UNICHAR_ID unichar_id) { UNICHAR_ID unichar_id) {
if (num_edges_ == max_num_edges_) return false;
EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
&(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges); &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
int search_index; int search_index;

View File

@ -87,10 +87,9 @@ class Trie : public Dawg {
// contain more edges than max_num_edges, all the edges are cleared // contain more edges than max_num_edges, all the edges are cleared
// so that new inserts can proceed). // so that new inserts can proceed).
Trie(DawgType type, const STRING &lang, PermuterType perm, Trie(DawgType type, const STRING &lang, PermuterType perm,
uinT64 max_num_edges, int unicharset_size, int debug_level) { int unicharset_size, int debug_level) {
init(type, lang, perm, unicharset_size, debug_level); init(type, lang, perm, unicharset_size, debug_level);
num_edges_ = 0; num_edges_ = 0;
max_num_edges_ = max_num_edges;
deref_node_index_mask_ = ~letter_mask_; deref_node_index_mask_ = ~letter_mask_;
new_dawg_node(); // need to allocate node 0 new_dawg_node(); // need to allocate node 0
initialized_patterns_ = false; initialized_patterns_ = false;
@ -415,7 +414,6 @@ class Trie : public Dawg {
// Member variables // Member variables
TRIE_NODES nodes_; // vector of nodes in the Trie TRIE_NODES nodes_; // vector of nodes in the Trie
uinT64 num_edges_; // sum of all edges (forward and backward) uinT64 num_edges_; // sum of all edges (forward and backward)
uinT64 max_num_edges_; // maximum number of edges allowed
uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
// Freelist of edges in the root backwards node that were previously zeroed. // Freelist of edges in the root backwards node that were previously zeroed.

View File

@ -32,8 +32,6 @@
#include "trie.h" #include "trie.h"
#include "unicharset.h" #include "unicharset.h"
static const int kMaxNumEdges = 30000000;
int main(int argc, char** argv) { int main(int argc, char** argv) {
if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) || if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
(argc == 6 && strcmp(argv[1], "-r") == 0))) { (argc == 6 && strcmp(argv[1], "-r") == 0))) {
@ -69,8 +67,7 @@ int main(int argc, char** argv) {
tesseract::Trie trie( tesseract::Trie trie(
// the first 3 arguments are not used in this case // the first 3 arguments are not used in this case
tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
kMaxNumEdges, unicharset.size(), unicharset.size(), classify->getDict().dawg_debug_level);
classify->getDict().dawg_debug_level);
tprintf("Reading word list from '%s'\n", wordlist_filename); tprintf("Reading word list from '%s'\n", wordlist_filename);
if (!trie.read_and_add_word_list(wordlist_filename, unicharset, if (!trie.read_and_add_word_list(wordlist_filename, unicharset,
reverse_policy)) { reverse_policy)) {