mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
Fixed endian bug in dawg reader, Added word bigram correction,
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@649 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
6e3d810c1d
commit
fdd4ffe85e
@ -98,6 +98,32 @@ int Dawg::check_for_words(const char *filename,
|
||||
return misses;
|
||||
}
|
||||
|
||||
void Dawg::iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
WERD_CHOICE word(&unicharset);
|
||||
iterate_words_rec(word, 0, cb);
|
||||
}
|
||||
|
||||
void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
|
||||
NODE_REF to_explore,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
NodeChildVector children;
|
||||
this->unichar_ids_of(to_explore, &children);
|
||||
for (int i = 0; i < children.size(); i++) {
|
||||
WERD_CHOICE next_word(word_so_far);
|
||||
next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
|
||||
if (this->end_of_word(children[i].edge_ref)) {
|
||||
STRING s;
|
||||
next_word.string_and_lengths(&s, NULL);
|
||||
cb->Run(s.string());
|
||||
}
|
||||
NODE_REF next = next_node(children[i].edge_ref);
|
||||
if (next != 0) {
|
||||
iterate_words_rec(next_word, next, cb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
|
||||
NODE_REF node, UNICHAR_ID wildcard) const {
|
||||
EDGE_REF edge;
|
||||
@ -286,12 +312,12 @@ void SquishedDawg::read_squished_dawg(FILE *file,
|
||||
int unicharset_size;
|
||||
fread(&unicharset_size, sizeof(inT32), 1, file);
|
||||
fread(&num_edges_, sizeof(inT32), 1, file);
|
||||
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
|
||||
|
||||
if (swap) {
|
||||
unicharset_size = reverse32(unicharset_size);
|
||||
num_edges_ = reverse32(num_edges_);
|
||||
}
|
||||
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
|
||||
Dawg::init(type, lang, perm, unicharset_size, debug_level);
|
||||
|
||||
edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
|
||||
@ -318,13 +344,13 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
|
||||
|
||||
node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
|
||||
|
||||
for (edge=0; edge < num_edges_; edge++) // init all slots
|
||||
for (edge = 0; edge < num_edges_; edge++) // init all slots
|
||||
node_map [edge] = -1;
|
||||
|
||||
node_counter = num_forward_edges(0);
|
||||
|
||||
*num_nodes = 0;
|
||||
for (edge=0; edge < num_edges_; edge++) { // search all slots
|
||||
for (edge = 0; edge < num_edges_; edge++) { // search all slots
|
||||
|
||||
if (forward_edge(edge)) {
|
||||
(*num_nodes)++; // count nodes links
|
||||
@ -332,6 +358,7 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
|
||||
num_edges = num_forward_edges(edge);
|
||||
if (edge != 0) node_counter += num_edges;
|
||||
edge += num_edges;
|
||||
if (edge >= num_edges_) break;
|
||||
if (backward_edge(edge)) while (!last_edge(edge++));
|
||||
edge--;
|
||||
}
|
||||
@ -369,7 +396,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
|
||||
tprintf("%d edges in DAWG\n", num_edges);
|
||||
}
|
||||
|
||||
for (edge=0; edge<num_edges_; edge++) {
|
||||
for (edge = 0; edge < num_edges_; edge++) {
|
||||
if (forward_edge(edge)) { // write forward edges
|
||||
do {
|
||||
old_index = next_node_from_edge_rec(edges_[edge]);
|
||||
@ -379,6 +406,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
|
||||
set_next_node(edge, old_index);
|
||||
} while (!last_edge(edge++));
|
||||
|
||||
if (edge >= num_edges_) break;
|
||||
if (backward_edge(edge)) // skip back links
|
||||
while (!last_edge(edge++));
|
||||
|
||||
|
11
dict/dawg.h
11
dict/dawg.h
@ -34,6 +34,7 @@
|
||||
#include "elst.h"
|
||||
#include "ratngs.h"
|
||||
#include "params.h"
|
||||
#include "tesscallback.h"
|
||||
|
||||
#ifndef __GNUC__
|
||||
#ifdef __MSW32__
|
||||
@ -142,6 +143,11 @@ class Dawg {
|
||||
const UNICHARSET &unicharset,
|
||||
bool enable_wildcard) const;
|
||||
|
||||
// For each word in the Dawg, call the given (permanent) callback with the
|
||||
// text (UTF-8) version of the word.
|
||||
void iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const char *> *cb) const;
|
||||
|
||||
// Pure virtual function that should be implemented by the derived classes.
|
||||
|
||||
/// Returns the edge that corresponds to the letter out of this node.
|
||||
@ -268,6 +274,11 @@ class Dawg {
|
||||
bool match_words(WERD_CHOICE *word, inT32 index,
|
||||
NODE_REF node, UNICHAR_ID wildcard) const;
|
||||
|
||||
// Recursively iterate over all words in a dawg (see public iterate_words).
|
||||
void iterate_words_rec(const WERD_CHOICE &word_so_far,
|
||||
NODE_REF to_explore,
|
||||
TessCallback1<const char *> *cb) const;
|
||||
|
||||
// Member Variables.
|
||||
DawgType type_;
|
||||
STRING lang_;
|
||||
|
130
dict/dict.cpp
130
dict/dict.cpp
@ -16,7 +16,10 @@
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "dict.h"
|
||||
#include "unicodes.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable:4244) // Conversion warnings
|
||||
@ -41,6 +44,8 @@ Dict::Dict(Image* image_ptr)
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
|
||||
" patterns.", getImage()->getCCUtil()->params()),
|
||||
BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
|
||||
@ -48,6 +53,8 @@ Dict::Dict(Image* image_ptr)
|
||||
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
|
||||
" (e.g. for non-space delimited languages)",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word "
|
||||
"bigrams.", getImage()->getCCUtil()->params()),
|
||||
double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
|
||||
"Score multiplier for word matches which have good case and"
|
||||
"are frequent in the given language (lower is better).",
|
||||
@ -70,6 +77,9 @@ Dict::Dict(Image* image_ptr)
|
||||
"Score multiplier for poorly cased strings that are not in"
|
||||
" the dictionary and generally look like garbage (lower is"
|
||||
" better).", getImage()->getCCUtil()->params()),
|
||||
STRING_MEMBER(output_ambig_words_file, "",
|
||||
"Output file for ambiguities found in the dictionary",
|
||||
getImage()->getCCUtil()->params()),
|
||||
INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
|
||||
", to 2 for more details, to 3 to see all the debug messages",
|
||||
getImage()->getCCUtil()->params()),
|
||||
@ -104,6 +114,12 @@ Dict::Dict(Image* image_ptr)
|
||||
"Make AcceptableChoice() always return false. Useful"
|
||||
" when there is a need to explore all segmentations",
|
||||
getImage()->getCCUtil()->params()),
|
||||
double_MEMBER(stopper_ambiguity_threshold_gain, 8.0,
|
||||
"Gain factor for ambiguity threshold.",
|
||||
getImage()->getCCUtil()->params()),
|
||||
double_MEMBER(stopper_ambiguity_threshold_offset, 1.5,
|
||||
"Certainty offset for ambiguity threshold.",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices",
|
||||
getImage()->getCCUtil()->params()),
|
||||
INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
|
||||
@ -130,6 +146,11 @@ Dict::Dict(Image* image_ptr)
|
||||
BOOL_MEMBER(segment_segcost_rating, 0,
|
||||
"incorporate segmentation cost in word rating?",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_MEMBER(segment_nonalphabetic_script, false,
|
||||
"Don't use any alphabetic-specific tricks."
|
||||
"Set to true in the traineddata config file for"
|
||||
" scripts that are cursive or inherently fixed-pitch",
|
||||
getImage()->getCCUtil()->params()),
|
||||
double_MEMBER(segment_reward_script, 0.95,
|
||||
"Score multipler for script consistency within a word. "
|
||||
"Being a 'reward' factor, it should be <= 1. "
|
||||
@ -144,10 +165,10 @@ Dict::Dict(Image* image_ptr)
|
||||
double_MEMBER(segment_reward_chartype, 0.97,
|
||||
"Score multipler for char type consistency within a word. ",
|
||||
getImage()->getCCUtil()->params()),
|
||||
double_MEMBER(segment_reward_ngram_best_choice, 0.99,
|
||||
"Score multipler for ngram permuter's best choice"
|
||||
" (only used in the Han script path).",
|
||||
getImage()->getCCUtil()->params()),
|
||||
double_MEMBER(segment_reward_ngram_best_choice, 0.99,
|
||||
"Score multipler for ngram permuter's best choice"
|
||||
" (only used in the Han script path).",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
|
||||
getImage()->getCCUtil()->params()),
|
||||
BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ",
|
||||
@ -182,14 +203,17 @@ Dict::Dict(Image* image_ptr)
|
||||
hyphen_unichar_id_ = INVALID_UNICHAR_ID;
|
||||
document_words_ = NULL;
|
||||
pending_words_ = NULL;
|
||||
bigram_dawg_ = NULL;
|
||||
freq_dawg_ = NULL;
|
||||
punc_dawg_ = NULL;
|
||||
max_fixed_length_dawgs_wdlen_ = -1;
|
||||
wordseg_rating_adjust_factor_ = -1.0f;
|
||||
output_ambig_words_file_ = NULL;
|
||||
}
|
||||
|
||||
// Frees the stored hyphen word and closes the ambiguity output file if one
// is open.
Dict::~Dict() {
  delete hyphen_word_;  // Deleting a null pointer is a no-op.
  if (output_ambig_words_file_ != NULL) {
    fclose(output_ambig_words_file_);
  }
}
|
||||
|
||||
void Dict::Load() {
|
||||
@ -199,6 +223,10 @@ void Dict::Load() {
|
||||
if (dawgs_.length() != 0) this->End();
|
||||
|
||||
hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
|
||||
|
||||
LoadEquivalenceList(kHyphenLikeUTF8);
|
||||
LoadEquivalenceList(kApostropheLikeUTF8);
|
||||
|
||||
TessdataManager &tessdata_manager =
|
||||
getImage()->getCCUtil()->tessdata_manager;
|
||||
|
||||
@ -219,12 +247,26 @@ void Dict::Load() {
|
||||
new SquishedDawg(tessdata_manager.GetDataFilePtr(),
|
||||
DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level);
|
||||
}
|
||||
if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
|
||||
if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) {
|
||||
bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
|
||||
DAWG_TYPE_WORD, // doesn't actually matter.
|
||||
lang,
|
||||
COMPOUND_PERM, // doesn't actually matter.
|
||||
dawg_debug_level);
|
||||
}
|
||||
if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
|
||||
freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
|
||||
DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM,
|
||||
dawg_debug_level);
|
||||
dawgs_ += freq_dawg_;
|
||||
}
|
||||
if (load_unambig_dawg &&
|
||||
tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) {
|
||||
unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
|
||||
DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
|
||||
dawg_debug_level);
|
||||
dawgs_ += unambig_dawg_;
|
||||
}
|
||||
|
||||
if (((STRING &)user_words_suffix).length() > 0) {
|
||||
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
||||
@ -232,7 +274,8 @@ void Dict::Load() {
|
||||
dawg_debug_level);
|
||||
name = getImage()->getCCUtil()->language_data_path_prefix;
|
||||
name += user_words_suffix;
|
||||
if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
|
||||
if (!trie_ptr->read_word_list(name.string(), getUnicharset(),
|
||||
Trie::RRP_REVERSE_IF_HAS_RTL)) {
|
||||
tprintf("Error: failed to load %s\n", name.string());
|
||||
exit(1);
|
||||
}
|
||||
@ -295,6 +338,7 @@ void Dict::End() {
|
||||
dawgs_.delete_data_pointers();
|
||||
successors_.delete_data_pointers();
|
||||
dawgs_.clear();
|
||||
delete bigram_dawg_;
|
||||
successors_.clear();
|
||||
document_words_ = NULL;
|
||||
max_fixed_length_dawgs_wdlen_ = -1;
|
||||
@ -304,12 +348,38 @@ void Dict::End() {
|
||||
}
|
||||
}
|
||||
|
||||
// Create unicharset adaptations of known, short lists of UTF-8 equivalent
|
||||
// characters (think all hyphen-like symbols). The first version of the
|
||||
// list is taken as equivalent for matching against the dictionary.
|
||||
void Dict::LoadEquivalenceList(const char *unichar_strings[]) {
|
||||
equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>());
|
||||
const UNICHARSET &unicharset = getUnicharset();
|
||||
GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back();
|
||||
for (int i = 0; unichar_strings[i] != 0; i++) {
|
||||
UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]);
|
||||
if (unichar_id != INVALID_UNICHAR_ID) {
|
||||
equiv_list->push_back(unichar_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize all hyphen and apostrophes to the canonicalized one for
|
||||
// matching; pass everything else through as is.
|
||||
UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const {
|
||||
for (int i = 0; i < equivalent_symbols_.size(); i++) {
|
||||
if (equivalent_symbols_[i].contains(unichar_id)) {
|
||||
return equivalent_symbols_[i][0];
|
||||
}
|
||||
}
|
||||
return unichar_id;
|
||||
}
|
||||
|
||||
// Returns true if in light of the current state unichar_id is allowed
|
||||
// according to at least one of the dawgs in the dawgs_ vector.
|
||||
// See more extensive comments in dict.h where this function is declared.
|
||||
int Dict::def_letter_is_okay(void* void_dawg_args,
|
||||
UNICHAR_ID unichar_id,
|
||||
bool word_end) {
|
||||
bool word_end) const {
|
||||
DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
|
||||
|
||||
if (dawg_debug_level >= 3) {
|
||||
@ -484,7 +554,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
|
||||
void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
|
||||
UNICHAR_ID unichar_id, bool word_end,
|
||||
DawgArgs *dawg_args,
|
||||
PermuterType *curr_perm) {
|
||||
PermuterType *curr_perm) const {
|
||||
NODE_REF node = GetStartingNode(dawg, info.ref);
|
||||
// Try to find the edge corresponding to the exact unichar_id and to all the
|
||||
// edges corresponding to the character class of unichar_id.
|
||||
@ -572,7 +642,7 @@ void Dict::WriteFixedLengthDawgs(
|
||||
// from hyphen_active_dawgs_ instead.
|
||||
void Dict::init_active_dawgs(int sought_word_length,
|
||||
DawgInfoVector *active_dawgs,
|
||||
bool ambigs_mode) {
|
||||
bool ambigs_mode) const {
|
||||
int i;
|
||||
if (sought_word_length != kAnyWordLength) {
|
||||
// Only search one fixed word length dawg.
|
||||
@ -604,7 +674,7 @@ void Dict::init_active_dawgs(int sought_word_length,
|
||||
|
||||
// If hyphenated() returns true, copy the entries from hyphen_constraints_
|
||||
// into the given constraints vector.
|
||||
void Dict::init_constraints(DawgInfoVector *constraints) {
|
||||
void Dict::init_constraints(DawgInfoVector *constraints) const {
|
||||
if (hyphenated()) {
|
||||
*constraints = hyphen_constraints_;
|
||||
if (dawg_debug_level >= 3) {
|
||||
@ -670,7 +740,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
|
||||
strcat(filename, ".doc");
|
||||
doc_word_file = open_file (filename, "a");
|
||||
fprintf(doc_word_file, "%s\n",
|
||||
best_choice.debug_string(getUnicharset()).string());
|
||||
best_choice.debug_string().string());
|
||||
fclose(doc_word_file);
|
||||
}
|
||||
document_words_->add_word_to_dawg(best_choice);
|
||||
@ -693,7 +763,7 @@ void Dict::adjust_word(WERD_CHOICE *word,
|
||||
float new_rating = word->rating();
|
||||
if (debug) {
|
||||
tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "",
|
||||
word->debug_string(getUnicharset()).string(), word->rating());
|
||||
word->debug_string().string(), word->rating());
|
||||
}
|
||||
new_rating += kRatingPad;
|
||||
if (nonword) { // non-dictionary word
|
||||
@ -733,9 +803,9 @@ void Dict::adjust_word(WERD_CHOICE *word,
|
||||
LogNewChoice(adjust_factor, certainty_array, false, word);
|
||||
}
|
||||
|
||||
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
|
||||
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
|
||||
const WERD_CHOICE *word_ptr = &word;
|
||||
WERD_CHOICE temp_word;
|
||||
WERD_CHOICE temp_word(word.unicharset());
|
||||
if (hyphenated()) {
|
||||
copy_hyphen_info(&temp_word);
|
||||
temp_word += word;
|
||||
@ -775,10 +845,40 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
|
||||
dawg_args.permuter : NO_PERM;
|
||||
}
|
||||
|
||||
bool Dict::valid_bigram(const WERD_CHOICE &word1,
|
||||
const WERD_CHOICE &word2) const {
|
||||
if (bigram_dawg_ == NULL) return false;
|
||||
|
||||
// Extract the core word from the middle of each word with any digits
|
||||
// replaced with question marks.
|
||||
int w1start, w1end, w2start, w2end;
|
||||
word1.punct_stripped(&w1start, &w1end);
|
||||
word2.punct_stripped(&w2start, &w2end);
|
||||
|
||||
// We don't want to penalize a single guillemet, hyphen, etc.
|
||||
// But our bigram list doesn't have any information about punctuation.
|
||||
if (w1start >= w1end) return word1.length() < 3;
|
||||
if (w2start >= w2end) return word2.length() < 3;
|
||||
|
||||
const UNICHARSET& uchset = getUnicharset();
|
||||
STRING bigram_string;
|
||||
for (int i = w1start; i < w1end; i++) {
|
||||
UNICHAR_ID ch = NormalizeUnicharIdForMatch(word1.unichar_id(i));
|
||||
bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
|
||||
}
|
||||
bigram_string += " ";
|
||||
for (int i = w2start; i < w2end; i++) {
|
||||
UNICHAR_ID ch = NormalizeUnicharIdForMatch(word2.unichar_id(i));
|
||||
bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
|
||||
}
|
||||
WERD_CHOICE normalized_word(bigram_string.string(), uchset);
|
||||
return bigram_dawg_->word_in_dawg(normalized_word);
|
||||
}
|
||||
|
||||
bool Dict::valid_punctuation(const WERD_CHOICE &word) {
|
||||
if (word.length() == 0) return NO_PERM;
|
||||
int i;
|
||||
WERD_CHOICE new_word;
|
||||
WERD_CHOICE new_word(word.unicharset());
|
||||
int last_index = word.length() - 1;
|
||||
int new_len = 0;
|
||||
for (i = 0; i <= last_index; ++i) {
|
||||
|
105
dict/dict.h
105
dict/dict.h
@ -89,16 +89,17 @@ struct DawgArgs {
|
||||
|
||||
class Dict {
|
||||
public:
|
||||
// Gain factor for ambiguity threshold.
|
||||
static const float kStopperAmbiguityThresholdGain;
|
||||
// Certainty offset for ambiguity threshold.
|
||||
static const float kStopperAmbiguityThresholdOffset;
|
||||
|
||||
Dict(Image* image_ptr);
|
||||
~Dict();
|
||||
const Image* getImage() const {
|
||||
return image_ptr_;
|
||||
}
|
||||
Image* getImage() {
|
||||
return image_ptr_;
|
||||
}
|
||||
const UNICHARSET& getUnicharset() const {
|
||||
return getImage()->getCCUtil()->unicharset;
|
||||
}
|
||||
UNICHARSET& getUnicharset() {
|
||||
return getImage()->getCCUtil()->unicharset;
|
||||
}
|
||||
@ -114,17 +115,17 @@ class Dict {
|
||||
/* hyphen.cpp ************************************************************/
|
||||
|
||||
/// Returns true if we've recorded the beginning of a hyphenated word.
|
||||
inline bool hyphenated() { return
|
||||
inline bool hyphenated() const { return
|
||||
!last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
|
||||
}
|
||||
/// Size of the base word (the part on the line before) of a hyphenated word.
|
||||
inline int hyphen_base_size() {
|
||||
inline int hyphen_base_size() const {
|
||||
return this->hyphenated() ? hyphen_word_->length() : 0;
|
||||
}
|
||||
/// If this word is hyphenated copy the base word (the part on
|
||||
/// the line before) of a hyphenated word into the given word.
|
||||
/// This function assumes that word is not NULL.
|
||||
inline void copy_hyphen_info(WERD_CHOICE *word) {
|
||||
inline void copy_hyphen_info(WERD_CHOICE *word) const {
|
||||
if (this->hyphenated()) {
|
||||
*word = *hyphen_word_;
|
||||
if (hyphen_debug_level) word->print("copy_hyphen_info: ");
|
||||
@ -133,19 +134,19 @@ class Dict {
|
||||
/// Erase the unichar ids corresponding to the portion of the word
|
||||
/// from the previous line. The word is not changed if it is not
|
||||
/// split between lines and hyphenated.
|
||||
inline void remove_hyphen_head(WERD_CHOICE *word) {
|
||||
inline void remove_hyphen_head(WERD_CHOICE *word) const {
|
||||
if (this->hyphenated()) {
|
||||
word->remove_unichar_ids(0, hyphen_word_->length());
|
||||
if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
|
||||
}
|
||||
}
|
||||
/// Check whether the word has a hyphen at the end.
|
||||
inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) {
|
||||
inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
|
||||
return (last_word_on_line_ && !first_pos &&
|
||||
unichar_id == hyphen_unichar_id_);
|
||||
}
|
||||
/// Same as above, but check the unichar at the end of the word.
|
||||
inline bool has_hyphen_end(const WERD_CHOICE &word) {
|
||||
inline bool has_hyphen_end(const WERD_CHOICE &word) const {
|
||||
int word_index = word.length() - 1;
|
||||
return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
|
||||
}
|
||||
@ -171,12 +172,14 @@ class Dict {
|
||||
/// from hyphen_active_dawgs_ instead.
|
||||
void init_active_dawgs(int sought_word_length,
|
||||
DawgInfoVector *active_dawgs,
|
||||
bool ambigs_mode);
|
||||
bool ambigs_mode) const;
|
||||
/// If hyphenated() returns true, copy the entries from hyphen_constraints_
|
||||
/// into the given constraints vector.
|
||||
void init_constraints(DawgInfoVector *constraints);
|
||||
void init_constraints(DawgInfoVector *constraints) const;
|
||||
/// Returns true if we are operating in ambigs mode.
|
||||
inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; }
|
||||
inline bool ambigs_mode(float rating_limit) {
|
||||
return rating_limit <= 0.0;
|
||||
}
|
||||
/// Recursively explore all the possible character combinations in
|
||||
/// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
|
||||
/// dawgs in the dawgs_ vector in parallel and discard invalid words.
|
||||
@ -316,6 +319,15 @@ class Dict {
|
||||
bool fix_replaceable,
|
||||
BLOB_CHOICE_LIST_VECTOR *Choices,
|
||||
bool *modified_blobs);
|
||||
double StopperAmbigThreshold(double f1, double f2) {
|
||||
return (f2 - f1) * stopper_ambiguity_threshold_gain -
|
||||
stopper_ambiguity_threshold_offset;
|
||||
}
|
||||
// If the certainty of any chunk in Choice (item1) is not ambiguous with the
|
||||
// corresponding chunk in the best choice (item2), frees Choice and
|
||||
// returns true.
|
||||
int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice
|
||||
void *item2); // EXPANDED_CHOICE *BestChoice
|
||||
/// Replaces the corresponding wrong ngram in werd_choice with the correct
|
||||
/// one. We indicate that this newly inserted ngram unichar is composed from
|
||||
/// several fragments and modify the corresponding entries in blob_choices to
|
||||
@ -401,7 +413,7 @@ class Dict {
|
||||
/// and Certainties.
|
||||
void FillViableChoice(const WERD_CHOICE &WordChoice,
|
||||
FLOAT32 AdjustFactor, const float Certainties[],
|
||||
bool SameString, VIABLE_CHOICE ViableChoice);
|
||||
VIABLE_CHOICE ViableChoice);
|
||||
/// Returns true if there are no alternative choices for the current word
|
||||
/// or if all alternatives have an adjust factor worse than Threshold.
|
||||
bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
|
||||
@ -467,6 +479,15 @@ class Dict {
|
||||
document_words_->clear();
|
||||
}
|
||||
|
||||
// Create unicharset adaptations of known, short lists of UTF-8 equivalent
|
||||
// characters (think all hyphen-like symbols). The first version of the
|
||||
// list is taken as equivalent for matching against the dictionary.
|
||||
void LoadEquivalenceList(const char *unichar_strings[]);
|
||||
|
||||
// Normalize all hyphen and apostrophes to the canonicalized one for
|
||||
// matching; pass everything else through as is. See LoadEquivalenceList().
|
||||
UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const;
|
||||
|
||||
/**
|
||||
* Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
|
||||
* of the current state the letter at word_index in the given word
|
||||
@ -531,13 +552,13 @@ class Dict {
|
||||
|
||||
//
|
||||
int def_letter_is_okay(void* void_dawg_args,
|
||||
UNICHAR_ID unichar_id, bool word_end);
|
||||
UNICHAR_ID unichar_id, bool word_end) const;
|
||||
|
||||
int (Dict::*letter_is_okay_)(void* void_dawg_args,
|
||||
UNICHAR_ID unichar_id, bool word_end);
|
||||
UNICHAR_ID unichar_id, bool word_end) const;
|
||||
/// Calls letter_is_okay_ member function.
|
||||
int LetterIsOkay(void* void_dawg_args,
|
||||
UNICHAR_ID unichar_id, bool word_end) {
|
||||
UNICHAR_ID unichar_id, bool word_end) const {
|
||||
return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
|
||||
}
|
||||
|
||||
@ -581,6 +602,8 @@ class Dict {
|
||||
inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
|
||||
/// Return the pointer to the punctuation dawg.
|
||||
inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
|
||||
/// Return the pointer to the unambiguous words dawg.
|
||||
inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
|
||||
/// Return the pointer to the Dawg that contains words of length word_length.
|
||||
inline const Dawg *GetFixedLengthDawg(int word_length) const {
|
||||
if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
|
||||
@ -603,7 +626,7 @@ class Dict {
|
||||
/// leading punctuation is found this would ensure that we are not
|
||||
/// expecting any particular trailing punctuation after the word).
|
||||
inline bool ConstraintsOk(const DawgInfoVector &constraints,
|
||||
int word_end, DawgType current_dawg_type) {
|
||||
int word_end, DawgType current_dawg_type) const {
|
||||
if (!word_end) return true;
|
||||
if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
|
||||
for (int c = 0; c < constraints.length(); ++c) {
|
||||
@ -627,7 +650,8 @@ class Dict {
|
||||
/// edges were found.
|
||||
void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
|
||||
UNICHAR_ID unichar_id, bool word_end,
|
||||
DawgArgs *dawg_args, PermuterType *current_permuter);
|
||||
DawgArgs *dawg_args,
|
||||
PermuterType *current_permuter) const;
|
||||
|
||||
/// Read/Write/Access special purpose dawgs which contain words
|
||||
/// only of a certain length (used for phrase search for
|
||||
@ -649,23 +673,25 @@ class Dict {
|
||||
int num_dawgs, int debug_level, FILE *output_file);
|
||||
|
||||
/// Check all the DAWGs to see if this word is in any of them.
|
||||
inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
|
||||
inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
|
||||
return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
|
||||
perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
|
||||
perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
|
||||
}
|
||||
int valid_word(const WERD_CHOICE &word, bool numbers_ok);
|
||||
int valid_word(const WERD_CHOICE &word) {
|
||||
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
|
||||
int valid_word(const WERD_CHOICE &word) const {
|
||||
return valid_word(word, false); // return NO_PERM for words with digits
|
||||
}
|
||||
int valid_word_or_number(const WERD_CHOICE &word) {
|
||||
int valid_word_or_number(const WERD_CHOICE &word) const {
|
||||
return valid_word(word, true); // return NUMBER_PERM for valid numbers
|
||||
}
|
||||
/// This function is used by api/tesseract_cube_combiner.cpp
|
||||
int valid_word(const char *string) {
|
||||
int valid_word(const char *string) const {
|
||||
WERD_CHOICE word(string, getUnicharset());
|
||||
return valid_word(word);
|
||||
}
|
||||
// Do the two WERD_CHOICEs form a meaningful bigram?
|
||||
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
|
||||
/// Returns true if the word contains a valid punctuation pattern.
|
||||
/// Note: Since the domains of punctuation symbols and symbols
|
||||
/// used in numbers are not disjoint, a valid number might contain
|
||||
@ -691,6 +717,8 @@ class Dict {
|
||||
inline void SetWordsegRatingAdjustFactor(float f) {
|
||||
wordseg_rating_adjust_factor_ = f;
|
||||
}
|
||||
// Accessor for best_choices_.
|
||||
const LIST &getBestChoices() { return best_choices_; }
|
||||
|
||||
private:
|
||||
/** Private member variables. */
|
||||
@ -723,15 +751,27 @@ class Dict {
|
||||
DawgInfoVector hyphen_active_dawgs_;
|
||||
DawgInfoVector hyphen_constraints_;
|
||||
bool last_word_on_line_;
|
||||
// List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
|
||||
// matching. The first member of each list is taken as canonical. For
|
||||
// example, the first list contains hyphens and dashes with the first symbol
|
||||
// being the ASCII hyphen minus.
|
||||
GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
|
||||
// Dawgs.
|
||||
DawgVector dawgs_;
|
||||
SuccessorListsVector successors_;
|
||||
Trie *pending_words_;
|
||||
// bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
|
||||
// any of them are present on the best choices list for a word pair.
|
||||
// the bigrams are stored as space-separated words where:
|
||||
// (1) leading and trailing punctuation has been removed from each word and
|
||||
// (2) any digits have been replaced with '?' marks.
|
||||
Dawg *bigram_dawg_;
|
||||
/// The following pointers are only cached for convenience.
|
||||
/// The dawgs will be deleted when dawgs_ vector is destroyed.
|
||||
// TODO(daria): need to support multiple languages in the future,
|
||||
// so maybe will need to maintain a list of dawgs of each kind.
|
||||
Dawg *freq_dawg_;
|
||||
Dawg *unambig_dawg_;
|
||||
Dawg *punc_dawg_;
|
||||
Trie *document_words_;
|
||||
/// Maximum word length of fixed-length word dawgs.
|
||||
@ -740,6 +780,8 @@ class Dict {
|
||||
/// Current segmentation cost adjust factor for word rating.
|
||||
/// See comments in incorporate_segcost.
|
||||
float wordseg_rating_adjust_factor_;
|
||||
// File for recording ambiguities discovered during dictionary search.
|
||||
FILE *output_ambig_words_file_;
|
||||
|
||||
public:
|
||||
/// Variable members.
|
||||
@ -750,11 +792,14 @@ class Dict {
|
||||
"A list of user-provided patterns.");
|
||||
BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
|
||||
BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
|
||||
BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
|
||||
BOOL_VAR_H(load_punc_dawg, true,
|
||||
"Load dawg with punctuation patterns.");
|
||||
BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
|
||||
BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
|
||||
" dawgs (e.g. for non-space delimited languages)");
|
||||
BOOL_VAR_H(load_bigram_dawg, false,
|
||||
"Load dawg with special word bigrams.");
|
||||
double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
|
||||
"Score multiplier for word matches which have good case and"
|
||||
"are frequent in the given language (lower is better).");
|
||||
@ -779,6 +824,8 @@ class Dict {
|
||||
"Score multiplier for poorly cased strings that are not in"
|
||||
" the dictionary and generally look like garbage (lower is"
|
||||
" better).");
|
||||
STRING_VAR_H(output_ambig_words_file, "",
|
||||
"Output file for ambiguities found in the dictionary");
|
||||
INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
|
||||
", to 2 for more details, to 3 to see all the debug messages");
|
||||
INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
|
||||
@ -801,6 +848,10 @@ class Dict {
|
||||
BOOL_VAR_H(stopper_no_acceptable_choices, false,
|
||||
"Make AcceptableChoice() always return false. Useful"
|
||||
" when there is a need to explore all segmentations");
|
||||
double_VAR_H(stopper_ambiguity_threshold_gain, 8.0,
|
||||
"Gain factor for ambiguity threshold.");
|
||||
double_VAR_H(stopper_ambiguity_threshold_offset, 1.5,
|
||||
"Certainty offset for ambiguity threshold.");
|
||||
BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
|
||||
INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
|
||||
STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
|
||||
@ -816,6 +867,10 @@ class Dict {
|
||||
"Turn on word script consistency permuter");
|
||||
BOOL_VAR_H(segment_segcost_rating, 0,
|
||||
"incorporate segmentation cost in word rating?");
|
||||
BOOL_VAR_H(segment_nonalphabetic_script, false,
|
||||
"Don't use any alphabetic-specific tricks."
|
||||
"Set to true in the traineddata config file for"
|
||||
" scripts that are cursive or inherently fixed-pitch");
|
||||
double_VAR_H(segment_reward_script, 0.95,
|
||||
"Score multipler for script consistency within a word. "
|
||||
"Being a 'reward' factor, it should be <= 1. "
|
||||
|
@ -51,7 +51,7 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
|
||||
const DawgInfoVector &active_dawgs,
|
||||
const DawgInfoVector &constraints) {
|
||||
if (hyphen_word_ == NULL) {
|
||||
hyphen_word_ = new WERD_CHOICE();
|
||||
hyphen_word_ = new WERD_CHOICE(word.unicharset());
|
||||
hyphen_word_->make_bad();
|
||||
}
|
||||
if (hyphen_word_->rating() > word.rating()) {
|
||||
|
@ -28,7 +28,7 @@
|
||||
/* define the maximum number of classes defined for any matcher
|
||||
and the maximum class id for any matcher. This must be changed
|
||||
if more different classes need to be classified */
|
||||
#define MAX_NUM_CLASSES 8192
|
||||
#define MAX_NUM_CLASSES 12288
|
||||
#define MAX_CLASS_ID (MAX_NUM_CLASSES - 1)
|
||||
|
||||
/** a CLASS_ID is the ascii character to be associated with a class */
|
||||
|
@ -86,7 +86,7 @@ void Dict::go_deeper_dawg_fxn(
|
||||
if (permute_debug && dawg_debug_level) {
|
||||
tprintf("early pruned word rating=%4.2f,"
|
||||
" permdawg_limit=%4.2f, word=%s\n", word->rating(),
|
||||
permdawg_limit, word->debug_string(getUnicharset()).string());
|
||||
permdawg_limit, word->debug_string().string());
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -106,8 +106,7 @@ void Dict::go_deeper_dawg_fxn(
|
||||
}
|
||||
if (clean_active_dawgs.size() > 0) {
|
||||
if (permute_debug && dawg_debug_level)
|
||||
tprintf("new hyphen choice = %s\n",
|
||||
word->debug_string(getUnicharset()).string());
|
||||
tprintf("new hyphen choice = %s\n", word->debug_string().string());
|
||||
word->set_permuter(more_args->permuter);
|
||||
adjust_word(word, certainties, permute_debug);
|
||||
set_hyphen_word(*word, *(more_args->active_dawgs),
|
||||
@ -190,11 +189,26 @@ void Dict::go_deeper_dawg_fxn(
|
||||
// Add a new word choice
|
||||
if (word_ending) {
|
||||
if (permute_debug && dawg_debug_level) {
|
||||
tprintf("found word = %s\n",
|
||||
word->debug_string(getUnicharset()).string());
|
||||
tprintf("found word = %s\n", word->debug_string().string());
|
||||
}
|
||||
if (ambigs_mode(*limit) &&
|
||||
strcmp(output_ambig_words_file.string(), "") != 0) {
|
||||
if (output_ambig_words_file_ == NULL) {
|
||||
output_ambig_words_file_ =
|
||||
fopen(output_ambig_words_file.string(), "w+");
|
||||
if (output_ambig_words_file_ == NULL) {
|
||||
tprintf("Failed to open output_ambig_words_file %s\n",
|
||||
output_ambig_words_file.string());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
STRING word_str;
|
||||
word->string_and_lengths(&word_str, NULL);
|
||||
word_str += " ";
|
||||
fprintf(output_ambig_words_file_, word_str.string());
|
||||
}
|
||||
WERD_CHOICE *adjusted_word = word;
|
||||
WERD_CHOICE hyphen_tail_word;
|
||||
WERD_CHOICE hyphen_tail_word(&getUnicharset());
|
||||
if (hyphen_base_size() > 0) {
|
||||
hyphen_tail_word = *word;
|
||||
remove_hyphen_head(&hyphen_tail_word);
|
||||
@ -226,7 +240,7 @@ void Dict::go_deeper_dawg_fxn(
|
||||
} else {
|
||||
if (permute_debug && dawg_debug_level) {
|
||||
tprintf("last unichar not OK at index %d in %s\n",
|
||||
word_index, word->debug_string(getUnicharset()).string());
|
||||
word_index, word->debug_string().string());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -249,7 +263,7 @@ void Dict::go_deeper_dawg_fxn(
|
||||
WERD_CHOICE *Dict::dawg_permute_and_select(
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
|
||||
int sought_word_length, int start_char_choice_index) {
|
||||
WERD_CHOICE *best_choice = new WERD_CHOICE();
|
||||
WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
|
||||
best_choice->make_bad();
|
||||
best_choice->set_rating(rating_limit);
|
||||
if (char_choices.length() == 0) return best_choice;
|
||||
@ -272,7 +286,7 @@ WERD_CHOICE *Dict::dawg_permute_and_select(
|
||||
(segment_penalty_dict_case_bad /
|
||||
segment_penalty_dict_case_ok),
|
||||
NO_PERM, sought_word_length, end_char_choice_index);
|
||||
WERD_CHOICE word(MAX_WERD_LENGTH);
|
||||
WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
|
||||
copy_hyphen_info(&word);
|
||||
// Discard rating and certainty of the hyphen base (if any).
|
||||
word.set_rating(0.0);
|
||||
|
@ -126,12 +126,13 @@ int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) {
|
||||
* 1st choice of char 3, 2nd choice of char 4, 3rd choice of char 5, 2nd choice
|
||||
* of char 6. If n > number of choice, the closest (last) one is used.
|
||||
*/
|
||||
WERD_CHOICE* get_choice_from_posstr(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset,
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
int start_pos,
|
||||
const char* pos_str,
|
||||
float *certainties) {
|
||||
int pos_str_len = strlen(pos_str);
|
||||
WERD_CHOICE* wchoice = new WERD_CHOICE();
|
||||
WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset);
|
||||
if (start_pos + pos_str_len > char_choices.length()) {
|
||||
wchoice->make_bad();
|
||||
return wchoice;
|
||||
@ -228,6 +229,7 @@ BLOB_CHOICE* find_choice_by_script(
|
||||
|
||||
|
||||
PermuterState::PermuterState() {
|
||||
unicharset_ = NULL;
|
||||
char_choices_ = NULL;
|
||||
adjust_factor_ = 1.0f;
|
||||
allow_collision_ = false;
|
||||
@ -240,6 +242,7 @@ void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
|
||||
float default_bias,
|
||||
bool debug) {
|
||||
ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH);
|
||||
unicharset_ = &unicharset;
|
||||
char_choices_ = &char_choices;
|
||||
word_length_ = char_choices.length();
|
||||
for (int i = 0; i < word_length_; ++i)
|
||||
@ -300,9 +303,8 @@ void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice,
|
||||
WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties,
|
||||
float *adjust_factor) {
|
||||
ASSERT_HOST(char_choices_ != NULL);
|
||||
WERD_CHOICE *word_choice = get_choice_from_posstr(*char_choices_,
|
||||
0, perm_state_,
|
||||
certainties);
|
||||
WERD_CHOICE *word_choice = get_choice_from_posstr(
|
||||
unicharset_, *char_choices_, 0, perm_state_, certainties);
|
||||
float rating = word_choice->rating() * adjust_factor_;
|
||||
word_choice->set_rating(rating);
|
||||
*adjust_factor = adjust_factor_;
|
||||
@ -431,7 +433,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
|
||||
if (permute_debug)
|
||||
print_char_choices_list("\n\nPermute FixedLength Word",
|
||||
char_choices, getUnicharset(), false);
|
||||
WERD_CHOICE* best_choice = new WERD_CHOICE(char_choices.length());
|
||||
WERD_CHOICE* best_choice =
|
||||
new WERD_CHOICE(&getUnicharset(), char_choices.length());
|
||||
const int max_dict_len = max_fixed_length_dawgs_wdlen_;
|
||||
const int min_dict_len = 2;
|
||||
char posstr[256];
|
||||
@ -461,7 +464,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
|
||||
}
|
||||
|
||||
if (part_choice && step > 1) { // found lexicon match
|
||||
part_choice->populate_unichars(getUnicharset());
|
||||
part_choice->populate_unichars();
|
||||
get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
|
||||
float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
|
||||
if (permuter_state)
|
||||
@ -472,8 +475,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
|
||||
part_choice->unichar_string().string());
|
||||
} else { // no lexicon match
|
||||
step = 1;
|
||||
part_choice =
|
||||
get_choice_from_posstr(char_choices, anchor_pos, "0", NULL);
|
||||
part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
|
||||
anchor_pos, "0", NULL);
|
||||
if (permute_debug)
|
||||
tprintf("Single char %d %s\n", anchor_pos,
|
||||
part_choice->unichar_string().string());
|
||||
@ -493,7 +496,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
|
||||
best_choice->rating(), match_score, adjusted_score);
|
||||
best_choice->set_rating(adjusted_score);
|
||||
}
|
||||
best_choice->populate_unichars(getUnicharset());
|
||||
best_choice->populate_unichars();
|
||||
if (permute_debug)
|
||||
tprintf("Found Best CJK word %f: %s\n",
|
||||
best_choice->rating(), best_choice->unichar_string().string());
|
||||
@ -554,11 +557,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
|
||||
print_char_choices_list("", char_choices, getUnicharset(), true);
|
||||
}
|
||||
|
||||
WERD_CHOICE *current_word = new WERD_CHOICE();
|
||||
WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
|
||||
BLOB_CHOICE_IT blob_choice_it;
|
||||
const UNICHARSET& unicharset = getUnicharset();
|
||||
bool replaced = false; // has any character choice been replaced
|
||||
int prev_unambig_type = 0; // the last chartype of an unambiguous char
|
||||
float certainties[MAX_PERM_LENGTH + 1];
|
||||
for (int x = 0; x < char_choices.length(); ++x) {
|
||||
BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
|
||||
UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
|
||||
@ -640,12 +644,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
|
||||
current_word->append_unichar_id(first_choice->unichar_id(), 1,
|
||||
first_choice->rating(),
|
||||
first_choice->certainty());
|
||||
certainties[x] = first_choice->certainty();
|
||||
}
|
||||
// All permuter choices should go through adjust_non_word so the choice
|
||||
// rating would be adjusted on the same scale.
|
||||
float certainties[MAX_PERM_LENGTH + 1];
|
||||
adjust_non_word(current_word, certainties, permute_debug);
|
||||
current_word->populate_unichars(unicharset);
|
||||
current_word->populate_unichars();
|
||||
if (replaced) {
|
||||
// Apply a reward multiplier on rating if a chartype permutation is made.
|
||||
float rating = current_word->rating();
|
||||
@ -682,10 +686,11 @@ WERD_CHOICE* Dict::permute_script_words(
|
||||
permute_debug > 1);
|
||||
}
|
||||
|
||||
WERD_CHOICE *current_word = new WERD_CHOICE();
|
||||
WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
|
||||
BLOB_CHOICE_IT blob_choice_it;
|
||||
bool replaced = false;
|
||||
bool prev_is_consistent = false;
|
||||
float certainties[MAX_PERM_LENGTH + 1];
|
||||
for (int x = 0; x < char_choices.length(); ++x) {
|
||||
blob_choice_it.set_to_list(char_choices.get(x));
|
||||
BLOB_CHOICE *first_choice = blob_choice_it.data();
|
||||
@ -737,13 +742,13 @@ WERD_CHOICE* Dict::permute_script_words(
|
||||
current_word->append_unichar_id(first_choice->unichar_id(), 1,
|
||||
first_choice->rating(),
|
||||
first_choice->certainty());
|
||||
certainties[x] = first_choice->certainty();
|
||||
prev_is_consistent = sid_consistent;
|
||||
}
|
||||
// All permuter choices should go through adjust_non_word so the choice
|
||||
// rating would be adjusted on the same scale.
|
||||
float certainties[MAX_PERM_LENGTH + 1];
|
||||
adjust_non_word(current_word, certainties, permute_debug);
|
||||
current_word->populate_unichars(getUnicharset());
|
||||
current_word->populate_unichars();
|
||||
if (replaced) {
|
||||
// Apply a reward multiplier on rating if a script permutation is made.
|
||||
float rating = current_word->rating();
|
||||
@ -780,19 +785,19 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
// Populate unichars_ and unichar_lengths_ of raw_choice. This is
|
||||
// needed for various components that still work with unichars rather
|
||||
// than unichar ids (e.g. LearnWord).
|
||||
raw_choice->populate_unichars(getUnicharset());
|
||||
raw_choice->populate_unichars();
|
||||
}
|
||||
if (this_choice && this_choice->rating() < best_choice->rating()) {
|
||||
*best_choice = *this_choice;
|
||||
// Populate unichars_ and unichar_lengths_ of best_choice. This is
|
||||
// needed for various components that still work with unichars rather
|
||||
// than unichar ids (dawg, *_ok functions, various hard-coded hacks).
|
||||
best_choice->populate_unichars(getUnicharset());
|
||||
best_choice->populate_unichars();
|
||||
|
||||
if (permute_debug) {
|
||||
best_choice->print("\n**** Populate BestChoice");
|
||||
cprintf("populate best_choice\n\t%s\n",
|
||||
best_choice->debug_string(getUnicharset()).string());
|
||||
best_choice->debug_string().string());
|
||||
}
|
||||
delete this_choice;
|
||||
return true;
|
||||
@ -811,13 +816,13 @@ WERD_CHOICE *Dict::permute_compound_words(
|
||||
float rating_limit) {
|
||||
BLOB_CHOICE *first_choice;
|
||||
WERD_CHOICE *best_choice = NULL;
|
||||
WERD_CHOICE current_word(MAX_WERD_LENGTH);
|
||||
WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
|
||||
int first_index = 0;
|
||||
int x;
|
||||
BLOB_CHOICE_IT blob_choice_it;
|
||||
|
||||
if (char_choices.length() > MAX_WERD_LENGTH) {
|
||||
WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
|
||||
WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
|
||||
bad_word_choice->make_bad();
|
||||
return bad_word_choice;
|
||||
}
|
||||
@ -874,7 +879,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
int x;
|
||||
BLOB_CHOICE_LIST_VECTOR subchoices;
|
||||
WERD_CHOICE *best_choice = NULL;
|
||||
WERD_CHOICE raw_choice;
|
||||
WERD_CHOICE raw_choice(&getUnicharset());
|
||||
raw_choice.make_bad();
|
||||
|
||||
DisableChoiceAccum();
|
||||
@ -886,7 +891,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
}
|
||||
|
||||
if (!subchoices.empty()) {
|
||||
WERD_CHOICE initial_choice;
|
||||
WERD_CHOICE initial_choice(&getUnicharset());
|
||||
initial_choice.make_bad();
|
||||
initial_choice.set_rating(rating_limit);
|
||||
|
||||
@ -906,10 +911,10 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
|
||||
if (segment_debug && current_word->rating() < MAX_FLOAT32) {
|
||||
cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
|
||||
current_word->debug_string(getUnicharset()).string(),
|
||||
current_word->debug_string().string(),
|
||||
current_word->rating(), current_word->certainty());
|
||||
}
|
||||
current_word->populate_unichars(getUnicharset());
|
||||
current_word->populate_unichars();
|
||||
|
||||
EnableChoiceAccum();
|
||||
}
|
||||
@ -919,7 +924,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
*/
|
||||
WERD_CHOICE *Dict::get_top_choice_word(
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices) {
|
||||
WERD_CHOICE *top_word = new WERD_CHOICE(MAX_PERM_LENGTH);
|
||||
WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH);
|
||||
float certainties[MAX_PERM_LENGTH];
|
||||
top_word->set_permuter(TOP_CHOICE_PERM);
|
||||
for (int x = 0; x < char_choices.length(); x++) {
|
||||
@ -956,11 +961,11 @@ WERD_CHOICE *Dict::permute_top_choice(
|
||||
const char *next_char = ""; //next in word
|
||||
const char *next_next_char = ""; //after next next in word
|
||||
|
||||
WERD_CHOICE word(MAX_PERM_LENGTH);
|
||||
WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
|
||||
word.set_permuter(TOP_CHOICE_PERM);
|
||||
WERD_CHOICE capital_word(MAX_PERM_LENGTH);
|
||||
WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
|
||||
capital_word.set_permuter(UPPER_CASE_PERM);
|
||||
WERD_CHOICE lower_word(MAX_PERM_LENGTH);
|
||||
WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
|
||||
lower_word.set_permuter(LOWER_CASE_PERM);
|
||||
|
||||
int x;
|
||||
@ -1023,7 +1028,7 @@ WERD_CHOICE *Dict::permute_top_choice(
|
||||
if (first_choice == NULL) {
|
||||
cprintf("Permuter found only fragments for"
|
||||
" character at position %d; word=%s\n",
|
||||
x, word.debug_string(getUnicharset()).string());
|
||||
x, word.debug_string().string());
|
||||
}
|
||||
ASSERT_HOST(first_choice != NULL);
|
||||
|
||||
@ -1132,7 +1137,7 @@ WERD_CHOICE *Dict::permute_top_choice(
|
||||
}
|
||||
}
|
||||
|
||||
if (word.rating() < raw_choice->rating()) {
|
||||
if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
|
||||
*raw_choice = word;
|
||||
LogNewChoice(1.0, certainties, true, raw_choice);
|
||||
}
|
||||
@ -1423,9 +1428,9 @@ WERD_CHOICE *Dict::top_fragments_permute_and_select(
|
||||
frag_char_choices += frag_choices;
|
||||
}
|
||||
|
||||
WERD_CHOICE *best_choice = new WERD_CHOICE();
|
||||
WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
|
||||
best_choice->make_bad();
|
||||
WERD_CHOICE word(MAX_PERM_LENGTH);
|
||||
WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
|
||||
word.set_permuter(TOP_CHOICE_PERM);
|
||||
float certainties[MAX_PERM_LENGTH];
|
||||
this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
|
||||
@ -1459,7 +1464,7 @@ void Dict::permute_choices(
|
||||
tprintf("%s permute_choices: char_choice_index=%d"
|
||||
" limit=%g rating=%g, certainty=%g word=%s\n",
|
||||
debug, char_choice_index, *limit, word->rating(),
|
||||
word->certainty(), word->debug_string(getUnicharset()).string());
|
||||
word->certainty(), word->debug_string().string());
|
||||
}
|
||||
if (char_choice_index < char_choices.length()) {
|
||||
BLOB_CHOICE_IT blob_choice_it;
|
||||
@ -1554,7 +1559,7 @@ void Dict::go_deeper_top_fragments_fxn(
|
||||
if (word_ending) {
|
||||
if (fragments_debug > 1) {
|
||||
tprintf("fragments_debug new choice = %s\n",
|
||||
word->debug_string(getUnicharset()).string());
|
||||
word->debug_string().string());
|
||||
}
|
||||
*limit = word->rating();
|
||||
adjust_non_word(word, certainties, permute_debug);
|
||||
@ -1567,8 +1572,7 @@ void Dict::go_deeper_top_fragments_fxn(
|
||||
} else {
|
||||
if (fragments_debug > 1) {
|
||||
tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
|
||||
word->debug_string(getUnicharset()).string(),
|
||||
word->rating(), *limit);
|
||||
word->debug_string().string(), word->rating(), *limit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -133,6 +133,8 @@ class PermuterState {
|
||||
private:
|
||||
static const char kPosFree = '.';
|
||||
|
||||
const UNICHARSET *unicharset_;
|
||||
|
||||
const BLOB_CHOICE_LIST_VECTOR *char_choices_; // reference pointer only
|
||||
// does not need to be allocated or freed
|
||||
char perm_state_[MAX_PERM_LENGTH]; // handles upto MAX_PERM_LENGTH-1 states
|
||||
|
@ -241,6 +241,19 @@ void print_state(const char *label, STATE *state, int num_joints) {
|
||||
new_line();
|
||||
}
|
||||
|
||||
// Prints out the number of fragments in each segment in a state to
|
||||
// toappend.
|
||||
void print_state(STATE *state, int num_joints, STRING *toappend) {
|
||||
PIECES_STATE pieces;
|
||||
bin_to_pieces(state, num_joints, pieces);
|
||||
for (int i = 0; pieces[i] > 0; i++) {
|
||||
if (i > 0) {
|
||||
toappend->add_str_int(" ", pieces[i]);
|
||||
} else {
|
||||
toappend->add_str_int("", pieces[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* set_n_ones
|
||||
|
@ -29,6 +29,7 @@
|
||||
I n c l u d e s
|
||||
----------------------------------------------------------------------*/
|
||||
#include "host.h"
|
||||
#include "strngs.h"
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
T y p e s
|
||||
@ -64,6 +65,8 @@ int ones_in_state(STATE *state, int num_joints);
|
||||
|
||||
void print_state(const char *label, STATE *state, int num_joints);
|
||||
|
||||
void print_state(STATE *state, int num_joints, STRING *toappend);
|
||||
|
||||
void set_n_ones(STATE *state, int n);
|
||||
|
||||
extern void free_state(STATE *);
|
||||
|
204
dict/stopper.cpp
204
dict/stopper.cpp
@ -17,13 +17,11 @@
|
||||
******************************************************************************/
|
||||
|
||||
#include "stopper.h"
|
||||
#include "emalloc.h"
|
||||
#include "matchdefs.h"
|
||||
#include "callcpp.h"
|
||||
#include "permute.h"
|
||||
#include "danerror.h"
|
||||
#include "const.h"
|
||||
#include "freelist.h"
|
||||
#include "efio.h"
|
||||
#include "scanutils.h"
|
||||
#include "unichar.h"
|
||||
@ -58,6 +56,10 @@ typedef struct
|
||||
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
|
||||
} EXPANDED_CHOICE;
|
||||
|
||||
void DeleteViableChoiceStruct(void *vcs) {
|
||||
delete (static_cast<VIABLE_CHOICE_STRUCT *>(vcs));
|
||||
}
|
||||
|
||||
#define BestCertainty(Choices) \
|
||||
(((VIABLE_CHOICE) first_node (Choices))->Certainty)
|
||||
|
||||
@ -66,10 +68,6 @@ typedef struct
|
||||
#define BestFactor(Choices) \
|
||||
(((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
|
||||
|
||||
#define AmbigThreshold(F1,F2) \
|
||||
(((F2) - (F1)) * tesseract::Dict::kStopperAmbiguityThresholdGain - \
|
||||
tesseract::Dict::kStopperAmbiguityThresholdOffset)
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Private Code
|
||||
----------------------------------------------------------------------------**/
|
||||
@ -100,23 +98,72 @@ static void ExpandChoice(VIABLE_CHOICE Choice,
|
||||
}
|
||||
}
|
||||
|
||||
// Constructs a viable choice with room for length entries: records the
// length and allocates the Blob and segmentation_state arrays, each with
// length elements. Both arrays are released by the destructor.
VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length) : Length(length) {
  this->Blob = new CHAR_CHOICE[length];
  this->segmentation_state = new uinT8[length];
}
|
||||
|
||||
// Constructs an empty viable choice: zero length and no allocated arrays.
VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT() : Length(0) {
  this->Blob = NULL;
  this->segmentation_state = NULL;
}
|
||||
|
||||
// Releases the Blob and segmentation_state arrays. delete[] on a null
// pointer is a no-op, so an object built by the default constructor is
// destroyed safely as well.
VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() {
  delete[] this->Blob;
  delete[] this->segmentation_state;
}
|
||||
|
||||
void VIABLE_CHOICE_STRUCT::Init(
|
||||
const WERD_CHOICE &word_choice,
|
||||
const PIECES_STATE &pieces_state,
|
||||
const float certainties[],
|
||||
FLOAT32 adjust_factor) {
|
||||
this->Rating = word_choice.rating();
|
||||
this->Certainty = word_choice.certainty();
|
||||
this->AdjustFactor = adjust_factor;
|
||||
this->ComposedFromCharFragments = false;
|
||||
ASSERT_HOST(this->Length == word_choice.length());
|
||||
|
||||
for (int i = 0, bw_idx = 0; i < word_choice.length(); i++, bw_idx++) {
|
||||
int blob_width = pieces_state[bw_idx];
|
||||
CHAR_CHOICE *blob_choice = &this->Blob[i];
|
||||
blob_choice->Class = word_choice.unichar_id(i);
|
||||
blob_choice->NumChunks = blob_width;
|
||||
blob_choice->Certainty = certainties[i];
|
||||
for (int f = 1; f < word_choice.fragment_length(i); ++f) {
|
||||
blob_width = pieces_state[++bw_idx];
|
||||
assert(blob_width > 0);
|
||||
blob_choice->NumChunks += blob_width;
|
||||
this->ComposedFromCharFragments = true;
|
||||
}
|
||||
this->segmentation_state[i] = blob_choice->NumChunks;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// If the certainty of any chunk in Choice (item1) is not ambiguous with the
|
||||
// corresponding chunk in the best choice (item2), frees Choice and
|
||||
// returns true.
|
||||
static int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice,
|
||||
void *item2) { // EXPANDED_CHOICE *BestChoice
|
||||
int Dict::FreeBadChoice(
|
||||
void *item1, // VIABLE_CHOICE Choice,
|
||||
void *item2) { // EXPANDED_CHOICE *BestChoice
|
||||
int i, j, Chunk;
|
||||
FLOAT32 Threshold;
|
||||
VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
|
||||
EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
|
||||
Threshold = AmbigThreshold(BestChoice->Choice->AdjustFactor,
|
||||
Choice->AdjustFactor);
|
||||
Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
|
||||
Choice->AdjustFactor);
|
||||
for (i = 0, Chunk = 0; i < Choice->Length; i++) {
|
||||
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++){
|
||||
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
|
||||
if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
|
||||
Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
|
||||
Threshold) {
|
||||
memfree(Choice);
|
||||
if (stopper_debug_level >= 2)
|
||||
PrintViableChoice(stderr, "\nDiscarding bad choice: ", Choice);
|
||||
delete Choice;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -124,11 +171,6 @@ static int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice,
|
||||
return false;
|
||||
}
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
const float Dict::kStopperAmbiguityThresholdGain = 8.0;
|
||||
const float Dict::kStopperAmbiguityThresholdOffset = 1.5;
|
||||
|
||||
bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
|
||||
WERD_CHOICE *BestChoice,
|
||||
DANGERR *fixpt,
|
||||
@ -158,7 +200,7 @@ bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
|
||||
|
||||
if (stopper_debug_level >= 1)
|
||||
tprintf("\nStopper: %s (word=%c, case=%c)\n",
|
||||
BestChoice->debug_string(getUnicharset()).string(),
|
||||
BestChoice->debug_string().string(),
|
||||
(is_valid_word ? 'y' : 'n'),
|
||||
(is_case_ok ? 'y' : 'n'));
|
||||
|
||||
@ -198,7 +240,7 @@ bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
|
||||
|
||||
if (stopper_debug_level >= 1) {
|
||||
tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
|
||||
BestChoice.debug_string(getUnicharset()).string(),
|
||||
BestChoice.debug_string().string(),
|
||||
(valid_word(BestChoice) ? 'y' : 'n'),
|
||||
(case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
|
||||
((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
|
||||
@ -320,10 +362,16 @@ void Dict::FilterWordChoices() {
|
||||
return;
|
||||
|
||||
// Compute certainties and class for each chunk in best choice.
|
||||
ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
|
||||
|
||||
set_rest (best_choices_, delete_d(list_rest (best_choices_),
|
||||
&BestChoice, FreeBadChoice));
|
||||
VIABLE_CHOICE_STRUCT *best_choice =
|
||||
(VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
|
||||
ExpandChoice(best_choice, &BestChoice);
|
||||
if (stopper_debug_level >= 2)
|
||||
PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
|
||||
TessResultCallback2<int, void*, void*>* is_bad =
|
||||
NewPermanentTessCallback(this, &Dict::FreeBadChoice);
|
||||
set_rest(best_choices_, delete_d(list_rest(best_choices_),
|
||||
&BestChoice, is_bad));
|
||||
delete is_bad;
|
||||
}
|
||||
|
||||
void Dict::FindClassifierErrors(FLOAT32 MinRating,
|
||||
@ -371,15 +419,15 @@ void Dict::InitChoiceAccum() {
|
||||
BLOB_WIDTH *BlobWidth, *End;
|
||||
|
||||
if (best_raw_choice_)
|
||||
memfree(best_raw_choice_);
|
||||
delete best_raw_choice_;
|
||||
best_raw_choice_ = NULL;
|
||||
|
||||
if (best_choices_)
|
||||
destroy_nodes(best_choices_, memfree);
|
||||
destroy_nodes(best_choices_, DeleteViableChoiceStruct);
|
||||
best_choices_ = NIL_LIST;
|
||||
|
||||
if (raw_choices_)
|
||||
destroy_nodes(raw_choices_, memfree);
|
||||
destroy_nodes(raw_choices_, DeleteViableChoiceStruct);
|
||||
raw_choices_ = NIL_LIST;
|
||||
|
||||
EnableChoiceAccum();
|
||||
@ -391,7 +439,7 @@ void Dict::InitChoiceAccum() {
|
||||
}
|
||||
|
||||
void Dict::ClearBestChoiceAccum() {
|
||||
if (best_choices_) destroy_nodes(best_choices_, memfree);
|
||||
if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
|
||||
best_choices_ = NIL_LIST;
|
||||
}
|
||||
|
||||
@ -420,7 +468,6 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
|
||||
const float Certainties[],
|
||||
bool raw_choice,
|
||||
WERD_CHOICE *WordChoice) {
|
||||
VIABLE_CHOICE NewChoice;
|
||||
LIST ChoicesList;
|
||||
LIST Choices;
|
||||
FLOAT32 Threshold;
|
||||
@ -429,14 +476,15 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
|
||||
return;
|
||||
|
||||
if (raw_choice) {
|
||||
if (!best_raw_choice_)
|
||||
best_raw_choice_ = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
|
||||
else if (WordChoice->rating() < best_raw_choice_->Rating) {
|
||||
if (ChoiceSameAs(*WordChoice, best_raw_choice_))
|
||||
FillViableChoice(*WordChoice, AdjustFactor, Certainties, true,
|
||||
if (!best_raw_choice_) {
|
||||
best_raw_choice_ =
|
||||
NewViableChoice(*WordChoice, AdjustFactor, Certainties);
|
||||
} else if (WordChoice->rating() < best_raw_choice_->Rating) {
|
||||
if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
|
||||
FillViableChoice(*WordChoice, AdjustFactor, Certainties,
|
||||
best_raw_choice_);
|
||||
else {
|
||||
memfree(best_raw_choice_);
|
||||
} else {
|
||||
delete best_raw_choice_;
|
||||
best_raw_choice_ =
|
||||
NewViableChoice(*WordChoice, AdjustFactor, Certainties);
|
||||
}
|
||||
@ -449,16 +497,20 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
|
||||
|
||||
// Throw out obviously bad choices to save some work.
|
||||
if (ChoicesList != NIL_LIST) {
|
||||
Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
|
||||
if (Threshold > -kStopperAmbiguityThresholdOffset)
|
||||
Threshold = -kStopperAmbiguityThresholdOffset;
|
||||
Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
|
||||
if (Threshold > -stopper_ambiguity_threshold_offset)
|
||||
Threshold = -stopper_ambiguity_threshold_offset;
|
||||
if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
|
||||
// Set the rating of the word to be terrible, so that it does not
|
||||
// get chosen as the best choice.
|
||||
if (stopper_debug_level >= 2) {
|
||||
tprintf("Discarding a choice with an overly low certainty"
|
||||
" %.4f vs best choice certainty %.4f\n",
|
||||
WordChoice->certainty(), BestCertainty(ChoicesList));
|
||||
STRING bad_string;
|
||||
WordChoice->string_and_lengths(&bad_string, NULL);
|
||||
tprintf("Discarding choice \"%s\" with an overly low certainty"
|
||||
" %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
|
||||
bad_string.string(), WordChoice->certainty(),
|
||||
BestCertainty(ChoicesList),
|
||||
Threshold + BestCertainty(ChoicesList));
|
||||
}
|
||||
WordChoice->set_rating(WERD_CHOICE::kBadRating);
|
||||
return;
|
||||
@ -466,7 +518,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
|
||||
}
|
||||
|
||||
// See if a choice with the same text string has already been found.
|
||||
NewChoice = NULL;
|
||||
VIABLE_CHOICE NewChoice = NULL;
|
||||
Choices = ChoicesList;
|
||||
|
||||
iterate(Choices) {
|
||||
@ -480,11 +532,10 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
|
||||
}
|
||||
|
||||
if (NewChoice) {
|
||||
FillViableChoice(*WordChoice, AdjustFactor, Certainties, true, NewChoice);
|
||||
FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
|
||||
ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
|
||||
}
|
||||
else {
|
||||
NewChoice = NewViableChoice (*WordChoice, AdjustFactor, Certainties);
|
||||
} else {
|
||||
NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
|
||||
}
|
||||
|
||||
ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
|
||||
@ -494,7 +545,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
|
||||
if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
|
||||
Choices =
|
||||
(LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
|
||||
destroy_nodes (list_rest (Choices), Efree);
|
||||
destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct);
|
||||
set_rest(Choices, NIL_LIST);
|
||||
}
|
||||
|
||||
@ -513,7 +564,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
bool *modified_blobs) {
|
||||
if (stopper_debug_level > 2) {
|
||||
tprintf("\nRunning NoDangerousAmbig() for %s\n",
|
||||
best_choice->debug_string(getUnicharset()).string());
|
||||
best_choice->debug_string().string());
|
||||
}
|
||||
|
||||
// Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
|
||||
@ -549,8 +600,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
for (i = 0; i < best_choice->length(); ++i) {
|
||||
BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
|
||||
BLOB_CHOICE_IT lst_it(lst);
|
||||
// TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
|
||||
// or are these fake ones good enough?
|
||||
lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
|
||||
0.0, 0.0, -1, -1, -1));
|
||||
0.0, 0.0, -1, -1, -1, 0, 1, false));
|
||||
ambig_blob_choices.push_back(lst);
|
||||
}
|
||||
}
|
||||
@ -630,7 +683,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
|
||||
bc_it.add_to_end(new BLOB_CHOICE(
|
||||
ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
|
||||
-1, -1, -1));
|
||||
-1, -1, -1, 0, 1, false));
|
||||
}
|
||||
}
|
||||
spec_it.forward();
|
||||
@ -650,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
} // end searching AmbigSpec_LIST
|
||||
} // end searching best_choice
|
||||
} // end searching replace and dangerous ambigs
|
||||
if (modified_best_choice) best_choice->populate_unichars(getUnicharset());
|
||||
if (modified_best_choice) best_choice->populate_unichars();
|
||||
// If any ambiguities were found permute the constructed ambig_blob_choices
|
||||
// to see if an alternative dictionary word can be found.
|
||||
if (ambigs_found) {
|
||||
@ -666,7 +719,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
if (ambigs_found) {
|
||||
if (stopper_debug_level >= 1) {
|
||||
tprintf ("Stopper: Possible ambiguous word = %s\n",
|
||||
alt_word->debug_string(getUnicharset()).string());
|
||||
alt_word->debug_string().string());
|
||||
}
|
||||
if (fixpt != NULL) {
|
||||
// Note: Currently character choices combined from fragments can only
|
||||
@ -691,6 +744,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
}
|
||||
delete alt_word;
|
||||
}
|
||||
if (output_ambig_words_file_ != NULL) {
|
||||
fprintf(output_ambig_words_file_, "\n");
|
||||
}
|
||||
|
||||
ambig_blob_choices.delete_data_pointers();
|
||||
return !ambigs_found;
|
||||
}
|
||||
@ -714,7 +771,6 @@ void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
mem_tidy (1);
|
||||
cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
|
||||
Choice->Length, LastChunk, Blob);
|
||||
assert(false); // this should never get executed
|
||||
@ -748,7 +804,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
|
||||
for (i = 0; i < fraglen; ++i) {
|
||||
if (fraglen > 1) {
|
||||
STRING frag_str =
|
||||
CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
|
||||
CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
|
||||
getUnicharset().unichar_insert(frag_str.string());
|
||||
uch_id = getUnicharset().unichar_to_id(frag_str.string());
|
||||
}
|
||||
@ -756,7 +812,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
|
||||
STRING correct_frag_uch =
|
||||
CHAR_FRAGMENT::to_string(correct_ngram_str,
|
||||
temp_blob_index - begin_blob_index,
|
||||
num_blobs_to_replace);
|
||||
num_blobs_to_replace, false);
|
||||
getUnicharset().unichar_insert(correct_frag_uch.string());
|
||||
UNICHAR_ID correct_frag_uch_id =
|
||||
getUnicharset().unichar_to_id(correct_frag_uch.string());
|
||||
@ -825,10 +881,9 @@ VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
|
||||
const float Certainties[]) {
|
||||
int Length = WordChoice.length();
|
||||
assert (Length <= MAX_NUM_CHUNKS && Length > 0);
|
||||
VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
|
||||
sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
|
||||
FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
|
||||
return (NewChoice);
|
||||
VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
|
||||
FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
|
||||
return NewChoice;
|
||||
}
|
||||
|
||||
void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
|
||||
@ -864,35 +919,10 @@ void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice
|
||||
|
||||
void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
|
||||
FLOAT32 AdjustFactor, const float Certainties[],
|
||||
bool SameString, VIABLE_CHOICE ViableChoice) {
|
||||
CHAR_CHOICE *NewChar;
|
||||
BLOB_WIDTH *BlobWidth;
|
||||
int x;
|
||||
VIABLE_CHOICE ViableChoice) {
|
||||
ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
|
||||
AdjustFactor);
|
||||
|
||||
ViableChoice->Rating = WordChoice.rating();
|
||||
ViableChoice->Certainty = WordChoice.certainty();
|
||||
ViableChoice->AdjustFactor = AdjustFactor;
|
||||
ViableChoice->ComposedFromCharFragments = false;
|
||||
if (!SameString) {
|
||||
ViableChoice->Length = WordChoice.length();
|
||||
}
|
||||
for (x = 0,
|
||||
NewChar = &(ViableChoice->Blob[0]),
|
||||
BlobWidth = current_segmentation_;
|
||||
x < WordChoice.length();
|
||||
x++, NewChar++, Certainties++, BlobWidth++) {
|
||||
if (!SameString) {
|
||||
NewChar->Class = WordChoice.unichar_id(x);
|
||||
}
|
||||
NewChar->NumChunks = *BlobWidth;
|
||||
NewChar->Certainty = *Certainties;
|
||||
for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
|
||||
BlobWidth++;
|
||||
assert(*BlobWidth > 0);
|
||||
NewChar->NumChunks += *BlobWidth;
|
||||
ViableChoice->ComposedFromCharFragments = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
|
||||
|
@ -27,6 +27,8 @@
|
||||
#include "states.h"
|
||||
#include "unichar.h"
|
||||
|
||||
class WERD_CHOICE;
|
||||
|
||||
typedef uinT8 BLOB_WIDTH;
|
||||
|
||||
struct DANGERR_INFO {
|
||||
@ -50,13 +52,36 @@ struct CHAR_CHOICE {
|
||||
float Certainty;
|
||||
};
|
||||
|
||||
struct VIABLE_CHOICE_STRUCT {
|
||||
class VIABLE_CHOICE_STRUCT {
|
||||
public:
|
||||
VIABLE_CHOICE_STRUCT();
|
||||
explicit VIABLE_CHOICE_STRUCT(int length);
|
||||
~VIABLE_CHOICE_STRUCT();
|
||||
|
||||
// Fill in the data with these values.
|
||||
void Init(const WERD_CHOICE& word_choice,
|
||||
const PIECES_STATE& pieces_state,
|
||||
const float certainties[],
|
||||
FLOAT32 adjust_factor);
|
||||
|
||||
int Length;
|
||||
float Rating;
|
||||
float Certainty;
|
||||
FLOAT32 AdjustFactor;
|
||||
int Length;
|
||||
bool ComposedFromCharFragments;
|
||||
CHAR_CHOICE Blob[1];
|
||||
CHAR_CHOICE *Blob;
|
||||
|
||||
// segmentation_state: for each choice, how many consecutive blobs
|
||||
// does it use?
|
||||
uinT8 *segmentation_state;
|
||||
|
||||
private:
|
||||
// Disallow assignment and copy construction
|
||||
VIABLE_CHOICE_STRUCT(const VIABLE_CHOICE_STRUCT &other)
|
||||
: Length(0), Blob(NULL), segmentation_state(NULL) {}
|
||||
VIABLE_CHOICE_STRUCT &operator=(const VIABLE_CHOICE_STRUCT &other) {
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
|
||||
|
@ -40,6 +40,16 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
|
||||
const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
|
||||
const char kForceReverse[] = "RRP_FORCE_REVERSE";
|
||||
|
||||
const char * const RTLReversePolicyNames[] = {
|
||||
kDoNotReverse,
|
||||
kReverseIfHasRTL,
|
||||
kForceReverse
|
||||
};
|
||||
|
||||
const char Trie::kAlphaPatternUnicode[] = "\u2000";
|
||||
const char Trie::kDigitPatternUnicode[] = "\u2001";
|
||||
const char Trie::kAlphanumPatternUnicode[] = "\u2002";
|
||||
@ -47,6 +57,10 @@ const char Trie::kPuncPatternUnicode[] = "\u2003";
|
||||
const char Trie::kLowerPatternUnicode[] = "\u2004";
|
||||
const char Trie::kUpperPatternUnicode[] = "\u2005";
|
||||
|
||||
const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
|
||||
return RTLReversePolicyNames[reverse_policy];
|
||||
}
|
||||
|
||||
// Reset the Trie to empty.
|
||||
void Trie::clear() {
|
||||
nodes_.delete_data_pointers();
|
||||
@ -156,10 +170,15 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
|
||||
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
|
||||
}
|
||||
|
||||
void Trie::add_word_to_dawg(const WERD_CHOICE &word,
|
||||
bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
|
||||
const GenericVector<bool> *repetitions) {
|
||||
if (word.length() <= 0) return; // can't add empty words
|
||||
if (word.length() <= 0) return false; // can't add empty words
|
||||
if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
|
||||
// Make sure the word does not contain invalid unichar ids.
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
if (word.unichar_id(i) < 0 ||
|
||||
word.unichar_id(i) >= unicharset_size_) return false;
|
||||
}
|
||||
|
||||
EDGE_RECORD *edge_ptr;
|
||||
NODE_REF last_node = 0;
|
||||
@ -233,6 +252,9 @@ void Trie::add_word_to_dawg(const WERD_CHOICE &word,
|
||||
if (add_failed) {
|
||||
tprintf("Re-initializing document dictionary...\n");
|
||||
clear();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -244,7 +266,8 @@ NODE_REF Trie::new_dawg_node() {
|
||||
}
|
||||
|
||||
bool Trie::read_word_list(const char *filename,
|
||||
const UNICHARSET &unicharset) {
|
||||
const UNICHARSET &unicharset,
|
||||
Trie::RTLReversePolicy reverse_policy) {
|
||||
FILE *word_file;
|
||||
char string[CHARS_PER_LINE];
|
||||
int word_count = 0;
|
||||
@ -254,6 +277,11 @@ bool Trie::read_word_list(const char *filename,
|
||||
while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
|
||||
chomp_string(string); // remove newline
|
||||
WERD_CHOICE word(string, unicharset);
|
||||
if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
|
||||
word.has_rtl_unichar_id()) ||
|
||||
reverse_policy == RRP_FORCE_REVERSE) {
|
||||
word.reverse_and_mirror_unichar_ids();
|
||||
}
|
||||
++word_count;
|
||||
if (debug_level_ && word_count % 10000 == 0)
|
||||
tprintf("Read %d words so far\n", word_count);
|
||||
@ -290,6 +318,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
|
||||
unicharset->unichar_insert(kUpperPatternUnicode);
|
||||
upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
|
||||
initialized_patterns_ = true;
|
||||
unicharset_size_ = unicharset->size();
|
||||
}
|
||||
|
||||
void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
|
||||
@ -351,7 +380,7 @@ bool Trie::read_pattern_list(const char *filename,
|
||||
chomp_string(string); // remove newline
|
||||
// Parse the pattern and construct a unichar id vector.
|
||||
// Record the number of repetitions of each unichar in the parallel vector.
|
||||
WERD_CHOICE word;
|
||||
WERD_CHOICE word(&unicharset);
|
||||
GenericVector<bool> repetitions_vec;
|
||||
const char *str_ptr = string;
|
||||
int step = unicharset.step(str_ptr);
|
||||
@ -397,7 +426,7 @@ bool Trie::read_pattern_list(const char *filename,
|
||||
// Insert the pattern into the trie.
|
||||
if (debug_level_ > 2) {
|
||||
tprintf("Inserting expanded user pattern %s\n",
|
||||
word.debug_string(unicharset).string());
|
||||
word.debug_string().string());
|
||||
}
|
||||
if (!this->word_in_dawg(word)) {
|
||||
this->add_word_to_dawg(word, &repetitions_vec);
|
||||
|
35
dict/trie.h
35
dict/trie.h
@ -61,6 +61,12 @@ namespace tesseract {
|
||||
*/
|
||||
class Trie : public Dawg {
|
||||
public:
|
||||
enum RTLReversePolicy {
|
||||
RRP_DO_NO_REVERSE,
|
||||
RRP_REVERSE_IF_HAS_RTL,
|
||||
RRP_FORCE_REVERSE,
|
||||
};
|
||||
|
||||
// Minimum number of concrete characters at the beginning of user patterns.
|
||||
static const int kSaneNumConcreteChars = 4;
|
||||
// Various unicode whitespace characters are used to denote unichar patterns,
|
||||
@ -73,6 +79,9 @@ class Trie : public Dawg {
|
||||
static const char kLowerPatternUnicode[];
|
||||
static const char kUpperPatternUnicode[];
|
||||
|
||||
static const char *get_reverse_policy_name(
|
||||
RTLReversePolicy reverse_policy);
|
||||
|
||||
// max_num_edges argument allows limiting the amount of memory this
|
||||
// Trie can consume (if a new word insert would cause the Trie to
|
||||
// contain more edges than max_num_edges, all the edges are cleared
|
||||
@ -86,7 +95,7 @@ class Trie : public Dawg {
|
||||
new_dawg_node(); // need to allocate node 0
|
||||
initialized_patterns_ = false;
|
||||
}
|
||||
~Trie() { nodes_.delete_data_pointers(); }
|
||||
virtual ~Trie() { nodes_.delete_data_pointers(); }
|
||||
|
||||
// Reset the Trie to empty.
|
||||
void clear();
|
||||
@ -149,8 +158,11 @@ class Trie : public Dawg {
|
||||
SquishedDawg *trie_to_dawg();
|
||||
|
||||
// Inserts the list of words from the given file into the Trie.
|
||||
// Per the reverse policy, calls WERD_CHOICE::reverse_and_mirror_unichar_ids()
|
||||
// on each word before inserting it into the Trie.
|
||||
bool read_word_list(const char *filename,
|
||||
const UNICHARSET &unicharset);
|
||||
const UNICHARSET &unicharset,
|
||||
Trie::RTLReversePolicy reverse);
|
||||
|
||||
// Inserts the list of patterns from the given file into the Trie.
|
||||
// The pattern list file should contain one pattern per line in UTF-8 format.
|
||||
@ -225,10 +237,13 @@ class Trie : public Dawg {
|
||||
// whether the unichar id with the corresponding index in the word is allowed
|
||||
// to repeat an unlimited number of times. For each entry that is true, MARKER
|
||||
// flag of the corresponding edge created for this unichar id is set to true).
|
||||
void add_word_to_dawg(const WERD_CHOICE &word,
|
||||
//
|
||||
// Return true if add succeeded, false otherwise (e.g. when a word contained
|
||||
// an invalid unichar id or the trie was getting too large and was cleared).
|
||||
bool add_word_to_dawg(const WERD_CHOICE &word,
|
||||
const GenericVector<bool> *repetitions);
|
||||
void add_word_to_dawg(const WERD_CHOICE &word) {
|
||||
add_word_to_dawg(word, NULL);
|
||||
bool add_word_to_dawg(const WERD_CHOICE &word) {
|
||||
return add_word_to_dawg(word, NULL);
|
||||
}
|
||||
|
||||
protected:
|
||||
@ -377,11 +392,11 @@ class Trie : public Dawg {
|
||||
UNICHAR_ID character_class_to_pattern(char ch);
|
||||
|
||||
// Member variables
|
||||
TRIE_NODES nodes_; ///< vector of nodes in the Trie
|
||||
uinT64 num_edges_; ///< sum of all edges (forward and backward)
|
||||
uinT64 max_num_edges_; ///< maximum number of edges allowed
|
||||
uinT64 deref_direction_mask_; ///< mask for EDGE_REF to extract direction
|
||||
uinT64 deref_node_index_mask_; ///< mask for EDGE_REF to extract node index
|
||||
TRIE_NODES nodes_; // vector of nodes in the Trie
|
||||
uinT64 num_edges_; // sum of all edges (forward and backward)
|
||||
uinT64 max_num_edges_; // maximum number of edges allowed
|
||||
uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
|
||||
uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
|
||||
// Variables for translating character class codes denoted in user patterns
|
||||
// file to the unichar ids used to represent them in a Trie.
|
||||
bool initialized_patterns_;
|
||||
|
Loading…
Reference in New Issue
Block a user