mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 18:02:40 +08:00
Removed upper limit on trie size, fixing issue 1020.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
df80e9dc59
commit
07ca24aeaf
@ -53,7 +53,6 @@ bool WordListLangModel::Init() {
|
||||
// The last parameter to the Trie constructor (the debug level) is set to
|
||||
// false for now, until Cube has a way to express its preferred debug level.
|
||||
dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
|
||||
- WordListLangModel::kMaxDawgEdges,
|
||||
cntxt_->CharacterSet()->ClassCount(), false);
|
||||
if (dawg_ == NULL) {
|
||||
return false;
|
||||
|
@ -69,7 +69,6 @@ class WordListLangModel : public LangModel {
|
||||
private:
|
||||
// constants needed to configure the language model
|
||||
static const int kMaxEdge = 512;
|
||||
- static const int kMaxDawgEdges = 20000;
|
||||
|
||||
CubeRecoContext *cntxt_;
|
||||
Trie *dawg_;
|
||||
|
@ -239,8 +239,7 @@ void Dict::Load(DawgCache *dawg_cache) {
|
||||
|
||||
if (((STRING &)user_words_suffix).length() > 0) {
|
||||
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
||||
- kMaxUserDawgEdges, getUnicharset().size(),
|
||||
- dawg_debug_level);
|
||||
+ getUnicharset().size(), dawg_debug_level);
|
||||
name = getCCUtil()->language_data_path_prefix;
|
||||
name += user_words_suffix;
|
||||
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
|
||||
@ -254,8 +253,7 @@ void Dict::Load(DawgCache *dawg_cache) {
|
||||
|
||||
if (((STRING &)user_patterns_suffix).length() > 0) {
|
||||
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
|
||||
- kMaxUserDawgEdges, getUnicharset().size(),
|
||||
- dawg_debug_level);
|
||||
+ getUnicharset().size(), dawg_debug_level);
|
||||
trie_ptr->initialize_patterns(&(getUnicharset()));
|
||||
name = getCCUtil()->language_data_path_prefix;
|
||||
name += user_patterns_suffix;
|
||||
@ -268,14 +266,12 @@ void Dict::Load(DawgCache *dawg_cache) {
|
||||
}
|
||||
|
||||
document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
|
||||
- kMaxDocDawgEdges, getUnicharset().size(),
|
||||
- dawg_debug_level);
|
||||
+ getUnicharset().size(), dawg_debug_level);
|
||||
dawgs_ += document_words_;
|
||||
|
||||
// This dawg is temporary and should not be searched by letter_is_ok.
|
||||
pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
|
||||
- kMaxDocDawgEdges, getUnicharset().size(),
|
||||
- dawg_debug_level);
|
||||
+ getUnicharset().size(), dawg_debug_level);
|
||||
|
||||
// Construct a list of corresponding successors for each dawg. Each entry i
|
||||
// in the successors_ vector is a vector of integers that represent the
|
||||
|
@ -61,9 +61,6 @@ static const char kHyphenSymbol[] = "-";
|
||||
static const char kSlashSymbol[] = "/";
|
||||
static const char kQuestionSymbol[] = "?";
|
||||
static const char kApostropheSymbol[] = "'";
|
||||
- static const int kMaxNumDawgEdgees = 2000000;
|
||||
- static const int kMaxDocDawgEdges = 250000;
|
||||
- static const int kMaxUserDawgEdges = 50000;
|
||||
static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
|
||||
static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
|
||||
static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
|
||||
|
@ -125,7 +125,6 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
|
||||
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
|
||||
int direction, bool word_end,
|
||||
UNICHAR_ID unichar_id) {
|
||||
- if (num_edges_ == max_num_edges_) return false;
|
||||
EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
|
||||
&(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
|
||||
int search_index;
|
||||
|
@ -87,10 +87,9 @@ class Trie : public Dawg {
|
||||
// contain more edges than max_num_edges, all the edges are cleared
|
||||
// so that new inserts can proceed).
|
||||
Trie(DawgType type, const STRING &lang, PermuterType perm,
|
||||
- uinT64 max_num_edges, int unicharset_size, int debug_level) {
|
||||
+ int unicharset_size, int debug_level) {
|
||||
init(type, lang, perm, unicharset_size, debug_level);
|
||||
num_edges_ = 0;
|
||||
- max_num_edges_ = max_num_edges;
|
||||
deref_node_index_mask_ = ~letter_mask_;
|
||||
new_dawg_node(); // need to allocate node 0
|
||||
initialized_patterns_ = false;
|
||||
@ -415,7 +414,6 @@ class Trie : public Dawg {
|
||||
// Member variables
|
||||
TRIE_NODES nodes_; // vector of nodes in the Trie
|
||||
uinT64 num_edges_; // sum of all edges (forward and backward)
|
||||
- uinT64 max_num_edges_; // maximum number of edges allowed
|
||||
uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
|
||||
uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
|
||||
// Freelist of edges in the root backwards node that were previously zeroed.
|
||||
|
@ -32,8 +32,6 @@
|
||||
#include "trie.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
- static const int kMaxNumEdges = 30000000;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
|
||||
(argc == 6 && strcmp(argv[1], "-r") == 0))) {
|
||||
@ -69,8 +67,7 @@ int main(int argc, char** argv) {
|
||||
tesseract::Trie trie(
|
||||
// the first 3 arguments are not used in this case
|
||||
tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
|
||||
- kMaxNumEdges, unicharset.size(),
|
||||
- classify->getDict().dawg_debug_level);
|
||||
+ unicharset.size(), classify->getDict().dawg_debug_level);
|
||||
tprintf("Reading word list from '%s'\n", wordlist_filename);
|
||||
if (!trie.read_and_add_word_list(wordlist_filename, unicharset,
|
||||
reverse_policy)) {
|
||||
|
Loading…
Reference in New Issue
Block a user