mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 18:02:40 +08:00
Removed upper limit on trie size, fixing issue 1020.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
df80e9dc59
commit
07ca24aeaf
@ -53,7 +53,6 @@ bool WordListLangModel::Init() {
|
|||||||
// The last parameter to the Trie constructor (the debug level) is set to
|
// The last parameter to the Trie constructor (the debug level) is set to
|
||||||
// false for now, until Cube has a way to express its preferred debug level.
|
// false for now, until Cube has a way to express its preferred debug level.
|
||||||
dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
|
dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
|
||||||
WordListLangModel::kMaxDawgEdges,
|
|
||||||
cntxt_->CharacterSet()->ClassCount(), false);
|
cntxt_->CharacterSet()->ClassCount(), false);
|
||||||
if (dawg_ == NULL) {
|
if (dawg_ == NULL) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -69,7 +69,6 @@ class WordListLangModel : public LangModel {
|
|||||||
private:
|
private:
|
||||||
// constants needed to configure the language model
|
// constants needed to configure the language model
|
||||||
static const int kMaxEdge = 512;
|
static const int kMaxEdge = 512;
|
||||||
static const int kMaxDawgEdges = 20000;
|
|
||||||
|
|
||||||
CubeRecoContext *cntxt_;
|
CubeRecoContext *cntxt_;
|
||||||
Trie *dawg_;
|
Trie *dawg_;
|
||||||
|
@ -239,8 +239,7 @@ void Dict::Load(DawgCache *dawg_cache) {
|
|||||||
|
|
||||||
if (((STRING &)user_words_suffix).length() > 0) {
|
if (((STRING &)user_words_suffix).length() > 0) {
|
||||||
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
||||||
kMaxUserDawgEdges, getUnicharset().size(),
|
getUnicharset().size(), dawg_debug_level);
|
||||||
dawg_debug_level);
|
|
||||||
name = getCCUtil()->language_data_path_prefix;
|
name = getCCUtil()->language_data_path_prefix;
|
||||||
name += user_words_suffix;
|
name += user_words_suffix;
|
||||||
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
|
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
|
||||||
@ -254,8 +253,7 @@ void Dict::Load(DawgCache *dawg_cache) {
|
|||||||
|
|
||||||
if (((STRING &)user_patterns_suffix).length() > 0) {
|
if (((STRING &)user_patterns_suffix).length() > 0) {
|
||||||
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
|
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
|
||||||
kMaxUserDawgEdges, getUnicharset().size(),
|
getUnicharset().size(), dawg_debug_level);
|
||||||
dawg_debug_level);
|
|
||||||
trie_ptr->initialize_patterns(&(getUnicharset()));
|
trie_ptr->initialize_patterns(&(getUnicharset()));
|
||||||
name = getCCUtil()->language_data_path_prefix;
|
name = getCCUtil()->language_data_path_prefix;
|
||||||
name += user_patterns_suffix;
|
name += user_patterns_suffix;
|
||||||
@ -268,14 +266,12 @@ void Dict::Load(DawgCache *dawg_cache) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
|
document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
|
||||||
kMaxDocDawgEdges, getUnicharset().size(),
|
getUnicharset().size(), dawg_debug_level);
|
||||||
dawg_debug_level);
|
|
||||||
dawgs_ += document_words_;
|
dawgs_ += document_words_;
|
||||||
|
|
||||||
// This dawg is temporary and should not be searched by letter_is_ok.
|
// This dawg is temporary and should not be searched by letter_is_ok.
|
||||||
pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
|
pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
|
||||||
kMaxDocDawgEdges, getUnicharset().size(),
|
getUnicharset().size(), dawg_debug_level);
|
||||||
dawg_debug_level);
|
|
||||||
|
|
||||||
// Construct a list of corresponding successors for each dawg. Each entry i
|
// Construct a list of corresponding successors for each dawg. Each entry i
|
||||||
// in the successors_ vector is a vector of integers that represent the
|
// in the successors_ vector is a vector of integers that represent the
|
||||||
|
@ -61,9 +61,6 @@ static const char kHyphenSymbol[] = "-";
|
|||||||
static const char kSlashSymbol[] = "/";
|
static const char kSlashSymbol[] = "/";
|
||||||
static const char kQuestionSymbol[] = "?";
|
static const char kQuestionSymbol[] = "?";
|
||||||
static const char kApostropheSymbol[] = "'";
|
static const char kApostropheSymbol[] = "'";
|
||||||
static const int kMaxNumDawgEdgees = 2000000;
|
|
||||||
static const int kMaxDocDawgEdges = 250000;
|
|
||||||
static const int kMaxUserDawgEdges = 50000;
|
|
||||||
static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
|
static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
|
||||||
static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
|
static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
|
||||||
static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
|
static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
|
||||||
|
@ -125,7 +125,6 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
|
|||||||
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
|
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
|
||||||
int direction, bool word_end,
|
int direction, bool word_end,
|
||||||
UNICHAR_ID unichar_id) {
|
UNICHAR_ID unichar_id) {
|
||||||
if (num_edges_ == max_num_edges_) return false;
|
|
||||||
EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
|
EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
|
||||||
&(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
|
&(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
|
||||||
int search_index;
|
int search_index;
|
||||||
|
@ -87,10 +87,9 @@ class Trie : public Dawg {
|
|||||||
// contain more edges than max_num_edges, all the edges are cleared
|
// contain more edges than max_num_edges, all the edges are cleared
|
||||||
// so that new inserts can proceed).
|
// so that new inserts can proceed).
|
||||||
Trie(DawgType type, const STRING &lang, PermuterType perm,
|
Trie(DawgType type, const STRING &lang, PermuterType perm,
|
||||||
uinT64 max_num_edges, int unicharset_size, int debug_level) {
|
int unicharset_size, int debug_level) {
|
||||||
init(type, lang, perm, unicharset_size, debug_level);
|
init(type, lang, perm, unicharset_size, debug_level);
|
||||||
num_edges_ = 0;
|
num_edges_ = 0;
|
||||||
max_num_edges_ = max_num_edges;
|
|
||||||
deref_node_index_mask_ = ~letter_mask_;
|
deref_node_index_mask_ = ~letter_mask_;
|
||||||
new_dawg_node(); // need to allocate node 0
|
new_dawg_node(); // need to allocate node 0
|
||||||
initialized_patterns_ = false;
|
initialized_patterns_ = false;
|
||||||
@ -415,7 +414,6 @@ class Trie : public Dawg {
|
|||||||
// Member variables
|
// Member variables
|
||||||
TRIE_NODES nodes_; // vector of nodes in the Trie
|
TRIE_NODES nodes_; // vector of nodes in the Trie
|
||||||
uinT64 num_edges_; // sum of all edges (forward and backward)
|
uinT64 num_edges_; // sum of all edges (forward and backward)
|
||||||
uinT64 max_num_edges_; // maximum number of edges allowed
|
|
||||||
uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
|
uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
|
||||||
uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
|
uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
|
||||||
// Freelist of edges in the root backwards node that were previously zeroed.
|
// Freelist of edges in the root backwards node that were previously zeroed.
|
||||||
|
@ -32,8 +32,6 @@
|
|||||||
#include "trie.h"
|
#include "trie.h"
|
||||||
#include "unicharset.h"
|
#include "unicharset.h"
|
||||||
|
|
||||||
static const int kMaxNumEdges = 30000000;
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
|
if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
|
||||||
(argc == 6 && strcmp(argv[1], "-r") == 0))) {
|
(argc == 6 && strcmp(argv[1], "-r") == 0))) {
|
||||||
@ -69,8 +67,7 @@ int main(int argc, char** argv) {
|
|||||||
tesseract::Trie trie(
|
tesseract::Trie trie(
|
||||||
// the first 3 arguments are not used in this case
|
// the first 3 arguments are not used in this case
|
||||||
tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
|
tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
|
||||||
kMaxNumEdges, unicharset.size(),
|
unicharset.size(), classify->getDict().dawg_debug_level);
|
||||||
classify->getDict().dawg_debug_level);
|
|
||||||
tprintf("Reading word list from '%s'\n", wordlist_filename);
|
tprintf("Reading word list from '%s'\n", wordlist_filename);
|
||||||
if (!trie.read_and_add_word_list(wordlist_filename, unicharset,
|
if (!trie.read_and_add_word_list(wordlist_filename, unicharset,
|
||||||
reverse_policy)) {
|
reverse_policy)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user