tesseract/dict/permute.cpp
theraysmith 3a13d80d24 Changes to dict for 3.00
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2009-07-11 02:20:33 +00:00

1507 lines
57 KiB
C++

/* -*-C-*-
********************************************************************************
*
* File: permute.c (Formerly permute.c)
* Description: Choose OCR text given character-probability maps
* for sequences of glyph fragments and a dictionary provided as
* a Directed Acyclic Word Graph (DAWG).
* In this file, "permute" should be read "combine."
* Author: Mark Seaman, OCR Technology
* Created: Fri Sep 22 14:05:51 1989
* Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
/*----------------------------------------------------------------------
I n c l u d e s
---------------------------------------------------------------------*/
#include <assert.h>
#include <math.h>
#include "const.h"
#include "permute.h"
#include "callcpp.h"
#include "context.h"
#include "conversion.h"
#include "freelist.h"
#include "globals.h"
#include "ndminx.h"
#include "permdawg.h"
#include "permngram.h"
#include "ratngs.h"
#include "stopper.h"
#include "tordvars.h"
#include "tprintf.h"
#include "trie.h"
#include "varable.h"
#include "unicharset.h"
#include "dict.h"
#include "image.h"
#include "ccutil.h"
// Incremented by Dict::permute_characters() each time it is entered.
int permutation_count; // Used in metrics.cpp.
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
// (or make it dynamic)
#define MAX_NUM_EDGES 2000000
// Edge capacity handed to the Trie constructor for the document-word and
// pending-word dictionaries built in Dict::init_permute().
#define MAX_DOC_EDGES 250000
// Edge capacity handed to the Trie constructor for the user word list
// built in Dict::init_permute().
#define MAX_USER_EDGES 50000
/* Weights for adjustment */
// Default value of segment_penalty_dict_nonword.
#define NON_WERD 1.25
// Default value of segment_penalty_garbage.
#define GARBAGE_STRING 1.5
// Longest word (in character positions) that permute_top_choice() accepts;
// also sizes its per-position certainty arrays.
#define MAX_PERM_LENGTH 128
// debugging flags
INT_VAR(fragments_debug, 0, "Debug character fragments");
BOOL_VAR(segment_debug, 0, "Debug the whole segmentation process");
BOOL_VAR(permute_debug, 0, "Debug char permutation process");
// control parameters
// permute_top_choice() abandons a word once its running rating exceeds
// this factor times the caller's rating limit.
double_VAR(bestrate_pruning_factor, 2.0,
"Multiplying factor of current best rate to prune other hypotheses");
BOOL_VAR(permute_script_word, 0,
"Turn on word script consistency permuter");
BOOL_VAR(segment_segcost_rating, 0,
"incorporate segmentation cost in word rating?");
double_VAR(segment_reward_script, 0.95,
"Score multipler for script consistency within a word. "
"Being a 'reward' factor, it should be <= 1. "
"Smaller value implies bigger reward.");
double_VAR(segment_penalty_dict_nonword, NON_WERD,
"Score multiplier for glyph fragment segmentations which do not "
"match a dictionary word (lower is better).");
double_VAR(segment_penalty_garbage, GARBAGE_STRING,
"Score multiplier for poorly cased strings that are not in the "
"dictionary and generally look like garbage (lower is better).");
BOOL_VAR(save_doc_words, 0, "Save Document Words");
BOOL_VAR(doc_dict_enable, 1, "Enable Document Dictionary ");
BOOL_VAR(ngram_permuter_activated, FALSE,
"Activate character-level n-gram-based permuter");
// When non-empty, appended to the language data path prefix to locate a
// user word list that Dict::init_permute() loads into its own Trie.
STRING_VAR(global_user_words_suffix, "", "A list of user-provided words.");
// This is an ugly way to incorporate segmentation cost in word rating.
// See comments in incorporate_segcost.
float wordseg_rating_adjust_factor;
// When non-zero, Dict::permute_all() returns before running the dawg and
// compound-word permuters.
int permute_only_top = 0;
// The three constants below are used only by good_choice() when
// tord_similarity_enable is set.
#define SIM_CERTAINTY_SCALE -10.0 /* Similarity matcher values */
#define SIM_CERTAINTY_OFFSET -10.0 /* Similarity matcher values */
#define SIMILARITY_FLOOR 100.0 /* Worst E*L product to stop on */
// TODO(daria): If hyphens are different in different languages and can be
// inferred from training data we should load their values dynamically.
static const char kHyphenSymbol[] = "-";
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
 * get_best_delete_other
 *
 * Picks the better of two word choices, deletes the loser and returns
 * the winner. If either argument is NULL the other one is returned
 * untouched. choice1 wins only when its rating is strictly lower than
 * that of choice2, or when choice2 is an empty word; in every other
 * case (including equal ratings) choice2 wins.
 **********************************************************************/
WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1,
                                   WERD_CHOICE *choice2) {
  if (choice1 == NULL) return choice2;
  if (choice2 == NULL) return choice1;

  const bool first_wins =
      choice1->rating() < choice2->rating() || choice2->length() == 0;
  WERD_CHOICE *winner = first_wins ? choice1 : choice2;
  WERD_CHOICE *loser = first_wins ? choice2 : choice1;
  delete loser;
  return winner;
}
/**********************************************************************
 * good_choice
 *
 * Returns non-zero when the choice is certain enough to count as a good
 * answer, i.e. when its certainty exceeds tord_certainty_threshold.
 * With tord_similarity_enable set, the certainty is instead derived
 * from the rating, and choices whose (rating + 1) * certainty product
 * exceeds SIMILARITY_FLOOR are rejected outright.
 **********************************************************************/
int good_choice(const WERD_CHOICE &choice) {
  float certainty;
  if (!tord_similarity_enable) {
    certainty = choice.certainty();
  } else if ((choice.rating() + 1) * choice.certainty() > SIMILARITY_FLOOR) {
    return false;
  } else {
    certainty =
        SIM_CERTAINTY_OFFSET + choice.rating() * SIM_CERTAINTY_SCALE;
  }
  if (certainty > tord_certainty_threshold) return true;
  return false;
}
/**********************************************************************
* add_document_word
*
* Add a word found on this document to the document specific
* dictionary.
**********************************************************************/
namespace tesseract {
void Dict::add_document_word(const WERD_CHOICE &best_choice) {
// Do not add hyphenated word parts to the document dawg.
// hyphen_word_ will be non-NULL after the set_hyphen_word() is
// called when the first part of the hyphenated word is
// discovered and while the second part of the word is recognized.
// hyphen_word_ is cleared in cc_recg() before the next word on
// the line is recognized.
if (hyphen_word_) return;
char filename[CHARS_PER_LINE];
FILE *doc_word_file;
int stringlen = best_choice.length();
if (!doc_dict_enable || valid_word(best_choice) ||
CurrentWordAmbig() || stringlen < 2)
return;
if (!good_choice(best_choice) || stringlen == 2) {
if (best_choice.certainty() < permuter_pending_threshold)
return;
if (!pending_words_->word_in_dawg(best_choice)) {
if (stringlen > 2 ||
(stringlen == 2 &&
getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
pending_words_->add_word_to_dawg(best_choice);
}
return;
}
}
if (save_doc_words) {
strcpy(filename, getImage()->getCCUtil()->imagefile.string());
strcat (filename, ".doc");
doc_word_file = open_file (filename, "a");
fprintf (doc_word_file, "%s\n",
best_choice.debug_string(getUnicharset()).string());
fclose(doc_word_file);
}
document_words_->add_word_to_dawg(best_choice);
}
/**********************************************************************
 * adjust_non_word
 *
 * Penalizes the rating of a word that is not in any dictionary. Words
 * whose case and punctuation both look acceptable are scaled by
 * segment_penalty_dict_nonword; everything else is scaled by
 * segment_penalty_garbage. The scaling is applied to the rating offset
 * by RATING_PAD, and the multiplier that was used is written to
 * *adjust_factor.
 **********************************************************************/
void Dict::adjust_non_word(WERD_CHOICE *word, float *adjust_factor) {
  if (permute_debug) {
    cprintf("Non-word: %s %4.2f ",
            word->debug_string(getUnicharset()).string(), word->rating());
  }

  float padded_rating = word->rating() + RATING_PAD;
  const bool looks_like_word =
      Context::case_ok(*word, getUnicharset()) && valid_punctuation(*word);

  if (!looks_like_word) {
    padded_rating *= segment_penalty_garbage;
    *adjust_factor = segment_penalty_garbage;
    if (permute_debug) {
      // Report which of the two checks failed: C = case, P = punctuation.
      if (!Context::case_ok(*word, getUnicharset())) tprintf(", C");
      if (!valid_punctuation(*word)) tprintf(", P");
    }
  } else {
    padded_rating *= segment_penalty_dict_nonword;
    *adjust_factor = segment_penalty_dict_nonword;
    if (permute_debug) tprintf(", W");
  }

  padded_rating -= RATING_PAD;
  word->set_rating(padded_rating);
  if (permute_debug) {
    cprintf (" %4.2f --> %4.2f\n", *adjust_factor, padded_rating);
  }
}
/**********************************************************************
* init_permute
*
* Initialize anything that needs to be set up for the permute
* functions: the hyphen unichar id, the list of dawgs (punctuation,
* system, number, user words, document words), the pending-word and
* frequent-word dawgs, and the per-dawg successor lists.
* If dawgs are already loaded they are released via end_permute() first.
* Exits the process if a configured user word list cannot be read.
**********************************************************************/
void Dict::init_permute() {
STRING name;
STRING &lang = getImage()->getCCUtil()->lang;
// Called again after a previous initialization: drop the old dawgs.
if (dawgs_.length() != 0) end_permute();
hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
TessdataManager &tessdata_manager =
getImage()->getCCUtil()->tessdata_manager;
// Load dawgs_.
// Each optional dawg is loaded only when its global_load_* flag is set and
// SeekToStart() reports success for the corresponding tessdata entry.
if (global_load_punc_dawg &&
tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) {
dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM);
}
if (global_load_system_dawg &&
tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) {
dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM);
}
if (global_load_number_dawg &&
tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) {
dawgs_ +=
new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_NUMBER, lang, NUMBER_PERM);
}
// Optional user word list, read from
// <language_data_path_prefix><global_user_words_suffix>.
if (((STRING &)global_user_words_suffix).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
MAX_USER_EDGES, getUnicharset().size());
name = getImage()->getCCUtil()->language_data_path_prefix;
name += global_user_words_suffix;
if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
tprintf("Error: failed to load %s\n", name.string());
exit(1);
}
dawgs_ += trie_ptr;
}
// The document dictionary starts empty and is appended to dawgs_, so the
// same object is reachable both as document_words_ and through dawgs_.
document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
MAX_DOC_EDGES, getUnicharset().size());
dawgs_ += document_words_;
// This dawg is temporary and should not be searched by letter_is_ok.
// It is therefore kept out of dawgs_.
pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
MAX_DOC_EDGES, getUnicharset().size());
// The frequent words dawg is only searched when a word
// is found in any of the other dawgs.
// NOTE(review): freq_dawg_ is left untouched here when the tessdata entry
// is absent; presumably it is NULL-initialized elsewhere - confirm.
if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM);
}
// Construct a list of corresponding successors for each dawg. Each entry i
// in the successors_ vector is a vector of integers that represent the
// indices into the dawgs_ vector of the successors for dawg i.
successors_.reserve(dawgs_.length());
for (int i = 0; i < dawgs_.length(); ++i) {
const Dawg *dawg = dawgs_[i];
SuccessorList *lst = new SuccessorList();
for (int j = 0; j < dawgs_.length(); ++j) {
const Dawg *other = dawgs_[j];
// Dawg j may follow dawg i only if both are for the same language and
// the kDawgSuccessors table allows that pair of dawg types.
if (dawg->lang() == other->lang() &&
kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
}
successors_ += lst;
}
}
// Releases everything init_permute() set up. Does nothing when no dawgs
// are loaded, which also makes a repeated call harmless.
void Dict::end_permute() {
  if (dawgs_.length() != 0) {
    dawgs_.delete_data_pointers();
    successors_.delete_data_pointers();
    dawgs_.clear();
    successors_.clear();
    // The document dictionary was stored in dawgs_ and has just been
    // deleted along with the other entries; only the pointer is reset.
    document_words_ = NULL;
    // Deleting a NULL pointer is a no-op, so no guard is needed.
    delete pending_words_;
    pending_words_ = NULL;
    delete freq_dawg_;
    freq_dawg_ = NULL;
  }
}
/**********************************************************************
 * permute_all
 *
 * Runs every available permuter over the character choices and returns
 * the best resulting word (caller owns it), or NULL if the top-choice
 * stage produced nothing usable. Each character must have a non-NULL
 * choice list.
 *
 * Note: the order of the permuters matters. get_best_delete_other()
 * keeps the later result on a rating tie, so the later permuter is the
 * one recorded when two permuters produce equally rated words.
 **********************************************************************/
WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                               float rating_limit,
                               WERD_CHOICE *raw_choice) {
  BOOL8 any_alpha;
  float top_choice_rating_limit = rating_limit;

  // Stage 1: top choice per character (may tighten the rating limit).
  WERD_CHOICE *best = permute_top_choice(char_choices,
                                         &top_choice_rating_limit,
                                         raw_choice, &any_alpha);

  // Stage 2: script consistency, skipped for Common and Latin words.
  if (permute_script_word &&
      !word_script_eq(char_choices, getUnicharset().common_sid()) &&
      !word_script_eq(char_choices, getUnicharset().latin_sid())) {
    WERD_CHOICE *script_choice = permute_script_words(char_choices);
    // TODO(dsl): incorporate segmentation cost into word rating.
    // This should only be turned on for scripts that we have a segmentation
    // cost model for, such as CJK.
    if (segment_segcost_rating)
      incorporate_segcost(script_choice);
    best = get_best_delete_other(best, script_choice);
  }

  // Stage 3: character fragments, only when still needed.
  if (best == NULL || best->fragment_mark()) {
    WERD_CHOICE *fragment_choice =
        top_fragments_permute_and_select(char_choices,
                                         top_choice_rating_limit);
    best = get_best_delete_other(best, fragment_choice);
  }

  // TODO(daria): update ngram permuter code. Until then, requesting the
  // n-gram permuter is a fatal configuration error.
  if (ngram_permuter_activated) {
    tprintf("Error: ngram permuter functionality is not available\n");
    exit(1);
  }

  if (best == NULL)
    return NULL;
  if (permute_only_top)
    return best;

  // Stage 4: dictionary words, then compound (hyphen/slash) words.
  WERD_CHOICE *dawg_choice =
      dawg_permute_and_select(char_choices, rating_limit);
  best = get_best_delete_other(best, dawg_choice);

  WERD_CHOICE *compound_choice =
      permute_compound_words(char_choices, rating_limit);
  best = get_best_delete_other(best, compound_choice);

  return best;
}
// Returns the unichar id of the first choice in blob_list, or
// INVALID_UNICHAR_ID when the iterator yields no choice.
UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) {
  BLOB_CHOICE_IT choice_it;
  choice_it.set_to_list(blob_list);
  if (!choice_it.data()) {
    return INVALID_UNICHAR_ID;
  }
  return choice_it.data()->unichar_id();
}
// Return the "dominant" script ID for the word. By "dominant", the script
// must account for at least half the characters. Otherwise, it returns 0.
int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
const UNICHARSET &unicharset) {
int max_script = unicharset.get_script_table_size();
int *sid = new int[max_script];
int x;
for (x = 0; x < max_script; x++) sid[x] = 0;
for (x = 0; x < char_choices.length(); ++x) {
BLOB_CHOICE_IT blob_choice_it;
blob_choice_it.set_to_list(char_choices.get(x));
sid[blob_choice_it.data()->script_id()]++;
}
// Note that high script ID overrides lower one on a tie, thus biasing
// towards non-Common script (if sorted that way in unicharset file).
int max_sid = 0;
for (x = 1; x < max_script; x++)
if (sid[x] >= sid[max_sid]) max_sid = x;
if (sid[max_sid] < char_choices.length() / 2)
max_sid = unicharset.null_sid();
delete[] sid;
return max_sid;
}
/**********************************************************************
 * Returns true when the word has a dominant script (as reported by
 * get_top_word_script) with a positive script ID and that ID equals
 * target_sid.
 **********************************************************************/
bool Dict::word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                          int target_sid) {
  const int dominant_sid = get_top_word_script(char_choices, getUnicharset());
  // A non-positive ID means "no dominant script". A script that is not
  // loaded (e.g. latin_sid() returning 0) can therefore never match.
  // Comparing IDs is cheaper than comparing script names with strcmp.
  if (dominant_sid <= 0) return false;
  return dominant_sid == target_sid;
}
/**********************************************************************
 * Walks the character choices of a single blob in list order and
 * returns the first one of the requested type. target_type is one of
 * 'a' (lower), 'A' (upper), '0' (digit), 'p' (punctuation), 'x' (none of
 * the previous four) or '*' (anything). Returns NULL if no choice
 * matches or target_type is not one of these characters.
 **********************************************************************/
BLOB_CHOICE* find_choice_by_type(
    BLOB_CHOICE_LIST *char_choices,
    char target_type,
    const UNICHARSET &unicharset) {
  BLOB_CHOICE_IT choice_it;
  choice_it.set_to_list(char_choices);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    const UNICHAR_ID id = choice_it.data()->unichar_id();
    bool matches;
    switch (target_type) {
      case '*':
        matches = true;
        break;
      case 'A':
        matches = unicharset.get_isupper(id);
        break;
      case 'a':
        matches = unicharset.get_islower(id);
        break;
      case '0':
        matches = unicharset.get_isdigit(id);
        break;
      case 'p':
        matches = unicharset.get_ispunctuation(id);
        break;
      case 'x':
        matches = !(unicharset.get_isupper(id) ||
                    unicharset.get_islower(id) ||
                    unicharset.get_isdigit(id) ||
                    unicharset.get_ispunctuation(id));
        break;
      default:
        matches = false;
        break;
    }
    if (matches) return choice_it.data();
  }
  return NULL;
}
/**********************************************************************
 * Walks the character choices of a single blob in list order and
 * returns the first one whose script matches. Choices with script ID 0
 * are always skipped.
 *
 * First pass: a choice matches if its script is target_sid, or - when
 * backup_sid is positive - backup_sid. The two are treated equally.
 * Second pass (only if the first found nothing and secondary_sid is
 * positive): a choice matches if its script is secondary_sid.
 *
 * Examples:
 *   first Han or Common choice:
 *     find_choice_by_script(cchoice, han_sid, common_sid, 0);
 *   first Han choice, falling back to Common if there is no Han:
 *     find_choice_by_script(cchoice, han_sid, 0, common_sid);
 *
 * Returns NULL when nothing matches.
 **********************************************************************/
BLOB_CHOICE* find_choice_by_script(
    BLOB_CHOICE_LIST *char_choices,
    int target_sid,
    int backup_sid,
    int secondary_sid) {
  BLOB_CHOICE_IT choice_it;

  // Pass 1: target script, or the equally acceptable backup script.
  choice_it.set_to_list(char_choices);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    if (choice_it.data()->script_id() == 0) continue;
    const bool is_target = choice_it.data()->script_id() == target_sid;
    const bool is_backup =
        backup_sid > 0 && choice_it.data()->script_id() == backup_sid;
    if (is_target || is_backup) return choice_it.data();
  }

  if (secondary_sid <= 0) return NULL;

  // Pass 2: the lower-priority secondary script.
  choice_it.set_to_list(char_choices);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    if (choice_it.data()->script_id() != 0 &&
        choice_it.data()->script_id() == secondary_sid) {
      return choice_it.data();
    }
  }
  return NULL;
}
/**********************************************************************
 * Incorporate segmentation cost into the word rating by multiplying the
 * rating with wordseg_rating_adjust_factor, which is determined in
 * bestfirst.cpp during state evaluation. Nothing happens when word is
 * NULL or the factor is not positive.
 *
 * This is not the cleanest way to do this. It would be better to
 * reorganize the SEARCH_STATE to keep track of associated states, or do
 * the rating adjustment outside the permuter in evaluate_state.
 **********************************************************************/
void Dict::incorporate_segcost(WERD_CHOICE *word) {
  if (word == NULL) return;
  if (wordseg_rating_adjust_factor <= 0) return;

  const float rating_before = word->rating();
  const float rating_after = rating_before * wordseg_rating_adjust_factor;
  word->set_rating(rating_after);

  if (permute_debug) {
    tprintf("Permute segadjust %f * %f --> %f\n",
            rating_before, wordseg_rating_adjust_factor, rating_after);
  }
}
/**********************************************************************
 * Try flipping characters in a word to get better script consistency.
 * Similar to how upper/lower case checking is done in top_choice_permuter,
 * this permuter tries to suggest a more script-consistent choice AND
 * modifies the rating. So it combines both the case_ok check and
 * adjust_non_word functionality. However, instead of penalizing an
 * inconsistent word with a > 1 multiplier, we reward the script-consistent
 * choice with a < 1 multiplier (segment_reward_script).
 *
 * Returns a newly allocated word owned by the caller, or NULL when the
 * word is longer than MAX_WERD_LENGTH, has no dominant script, or one of
 * its character positions has no choice at all.
 **********************************************************************/
WERD_CHOICE* Dict::permute_script_words(
    const BLOB_CHOICE_LIST_VECTOR &char_choices) {
  if (char_choices.length() > MAX_WERD_LENGTH)
    return NULL;

  int word_sid = get_top_word_script(char_choices, getUnicharset());
  if (word_sid == getUnicharset().null_sid())
    return NULL;

  if (permute_debug) {
    tprintf("\n\nPermuteScript %s\n",
            getUnicharset().get_script_from_script_id(word_sid));
    print_char_choices_list("", char_choices, getUnicharset(),
                            permute_debug > 1);
  }

  WERD_CHOICE *current_word = new WERD_CHOICE(MAX_WERD_LENGTH);
  BLOB_CHOICE_IT blob_choice_it;
  bool replaced = false;
  bool prev_is_consistent = false;
  for (int x = 0; x < char_choices.length(); ++x) {
    blob_choice_it.set_to_list(char_choices.get(x));
    BLOB_CHOICE *first_choice = blob_choice_it.data();
    if (!first_choice) {
      // Release the partially built word before bailing out; returning
      // without this delete leaked it.
      delete current_word;
      return NULL;
    }
    UNICHAR_ID unichar_id = first_choice->unichar_id();
    bool sid_consistent = (first_choice->script_id() == word_sid);
    bool this_is_punct = getUnicharset().get_ispunctuation(unichar_id);
    // Only try a replacement for a non-punctuation character of a foreign
    // script that directly follows a script-consistent character.
    if (!sid_consistent && !this_is_punct && prev_is_consistent) {
      // If the previous char is CJK, we prefer a cjk over non-cjk char
      if (permute_debug) {
        tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
                first_choice->rating());
        print_ratings_list("\t", char_choices.get(x), getUnicharset());
      }
      // prefer a script consistent choice
      BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
                                                word_sid, 0, 0);
      // make this a separate check
      // otherwise, prefer a punctuation
      if (c_it == NULL)
        c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());

      if (c_it != NULL) {
        if (permute_debug)
          tprintf("Replacing %d r%g ==> %d r%g\n",
                  first_choice->unichar_id(), first_choice->rating(),
                  c_it->unichar_id(), c_it->rating());
        first_choice = c_it;
        replaced = true;
      }
    }
    current_word->append_unichar_id_space_allocated(
        first_choice->unichar_id(), 1,
        first_choice->rating(), first_choice->certainty());
    // Consistency is tracked for the original top choice, not for the
    // replacement that may have been substituted above.
    prev_is_consistent = sid_consistent;
  }
  if (replaced) {
    // When we replace a word choice (usually top choice) with
    // another for the sake of script consistency, we need to improve its
    // rating so that it will replace the best choice. How much we modify
    // the rating determines how strong is the script consistency constraint.
    // We need a more consistent solution for all contextual constraints
    // like case, punct pattern, script, etc. Right now, this does the same
    // thing as adjust_non_words for case and punctuation rules.
    float rating = current_word->rating();
    rating *= segment_reward_script;
    current_word->set_rating(rating);
  }
  current_word->populate_unichars(getUnicharset());
  if (permute_debug && replaced)
    current_word->print("<== permute_script_word **");
  return current_word;
}
/**********************************************************************
* permute_characters
*
* Permute these characters together according to each of the different
* permuters that are enabled.
**********************************************************************/
void Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
float limit,
WERD_CHOICE *best_choice,
WERD_CHOICE *raw_choice) {
float old_raw_choice_rating = raw_choice->rating();
permutation_count++; /* Global counter */
if (tord_display_ratings > 1) {
cprintf("\nchar_choices in permute_characters:\n");
print_char_choices_list("\n==> Input CharChoices", char_choices,
getUnicharset(), true);
}
if (char_choices.length() == 1 &&
get_top_choice_uid(char_choices.get(0)) == 0)
return;
WERD_CHOICE *this_choice = permute_all(char_choices, limit, raw_choice);
if (raw_choice->rating() < old_raw_choice_rating) {
// Populate unichars_ and unichar_lengths_ of raw_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (e.g. AdaptToWord).
raw_choice->populate_unichars(getUnicharset());
}
if (this_choice && this_choice->rating() < best_choice->rating()) {
*best_choice = *this_choice;
// Populate unichars_ and unichar_lengths_ of best_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (dawg, *_ok functions, various hard-coded hacks).
best_choice->populate_unichars(getUnicharset());
if (tord_display_ratings) {
cprintf("permute_characters: %s\n",
best_choice->debug_string(getUnicharset()).string());
}
}
delete this_choice;
}
/**********************************************************************
 * permute_compound_words
 *
 * Treats the word as a compound joined by '-' or '/' separators: every
 * run of characters between separators is permuted on its own through
 * permute_subword() and the pieces are concatenated again.
 *
 * Returns a new word marked COMPOUND_PERM (owned by the caller), or NULL
 * when the top choices hold no usable separator or the accumulated
 * rating exceeds rating_limit. For input longer than MAX_WERD_LENGTH a
 * new "bad" word choice is returned instead.
 **********************************************************************/
WERD_CHOICE *Dict::permute_compound_words(
    const BLOB_CHOICE_LIST_VECTOR &char_choices,
    float rating_limit) {
  WERD_CHOICE current_word(MAX_WERD_LENGTH);
  BLOB_CHOICE_IT blob_choice_it;

  if (char_choices.length() > MAX_WERD_LENGTH) {
    WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
    bad_word_choice->make_bad();
    return bad_word_choice;
  }

  const UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
  const UNICHAR_ID dash = getUnicharset().unichar_to_id("-");

  int part_start = 0;  // index of the first char after the last separator
  int x;
  for (x = 0; x < char_choices.length(); ++x) {
    blob_choice_it.set_to_list(char_choices.get(x));
    BLOB_CHOICE *top = blob_choice_it.data();
    if (top->unichar_id() != slash && top->unichar_id() != dash)
      continue;

    // Permute the part that precedes this separator, if it is non-empty.
    if (x > part_start) {
      if (segment_debug)
        cprintf ("Hyphenated word found\n");
      permute_subword(char_choices, rating_limit, part_start,
                      x - 1, &current_word);
      if (current_word.rating() > rating_limit)
        break;
    }
    // Append hyphen/slash separator to current_word.
    current_word.append_unichar_id_space_allocated(
        top->unichar_id(), 1, top->rating(), top->certainty());
    part_start = x + 1;
  }

  // No compound result unless a separator was seen, characters remain
  // after the last one, and the rating is still within the limit.
  if (part_start <= 0 || part_start >= x ||
      !(current_word.rating() <= rating_limit)) {
    return NULL;
  }

  permute_subword(char_choices, rating_limit, part_start,
                  x - 1, &current_word);
  WERD_CHOICE *compound = new WERD_CHOICE(current_word);
  compound->set_permuter(COMPOUND_PERM);
  return compound;
}
/**********************************************************************
 * permute_subword
 *
 * Permute a part of a compound word; this subword is bounded by hyphens
 * and the start and end of the word. The standard permute_all() is run
 * on the choices in [start, end] only, and its result is appended to
 * *current_word. If there is nothing to permute or no non-empty result
 * comes back, the rating of *current_word is set to MAX_FLOAT32 instead.
 * Choice accumulation is disabled for the duration of the call and the
 * temporary result is released before returning.
 **********************************************************************/
void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                           float rating_limit,
                           int start,
                           int end,
                           WERD_CHOICE *current_word) {
  BLOB_CHOICE_LIST_VECTOR subchoices;
  WERD_CHOICE *best_choice = NULL;
  WERD_CHOICE raw_choice;
  raw_choice.make_bad();

  DisableChoiceAccum();

  // Collect the non-NULL choice lists of the requested range.
  for (int pos = start; pos <= end; pos++) {
    if (char_choices.get(pos) == NULL) continue;
    subchoices += char_choices.get(pos);
  }

  bool appended = false;
  if (!subchoices.empty()) {
    // Dawg debugging is forced on for the subword while segment_debug is
    // set, and put back to its previous value afterwards.
    bool old_segment_dawg_debug = segment_dawg_debug;
    if (segment_debug) segment_dawg_debug.set_value(true);
    best_choice = permute_all(subchoices, rating_limit, &raw_choice);
    if (segment_debug) {
      segment_dawg_debug.set_value(old_segment_dawg_debug);
    }

    if (best_choice && best_choice->length() > 0) {
      *current_word += *best_choice;
      appended = true;
    }
  }
  if (!appended) {
    current_word->set_rating(MAX_FLOAT32);
  }

  delete best_choice;  // deleting NULL is a no-op

  if (segment_debug && current_word->rating() < MAX_FLOAT32) {
    cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
             current_word->debug_string(getUnicharset()).string(),
             current_word->rating(), current_word->certainty());
  }

  EnableChoiceAccum();
}
/**********************************************************************
* permute_top_choice
*
* Return the top choice for each character as the choice for the word.
* In addition a choice is created for the best lower and upper case
* non-words. In each character position the best lower (or upper) case
* character is substituted for the best overall character.
*
* char_choices: one choice list per character position; each list must
*   be non-empty and contain at least one non-fragment choice (asserted).
* rating_limit: on entry, the word is abandoned once its running rating
*   exceeds bestrate_pruning_factor * *rating_limit; on a successful
*   return it holds the rating of the selected word before
*   adjust_non_word() was applied.
* raw_choice: overwritten with the plain top-choice word when that word
*   is rated better than the current *raw_choice.
* any_alpha: if non-NULL, set to TRUE when an alphabetic choice was seen
*   at any position, FALSE otherwise.
*
* Returns a newly allocated copy of the best of the three candidate words
* after adjust_non_word(), or NULL when the word has more than
* MAX_PERM_LENGTH positions, was pruned for its rating, or
* ngram_permuter_activated is set.
**********************************************************************/
WERD_CHOICE *Dict::permute_top_choice(
const BLOB_CHOICE_LIST_VECTOR &char_choices,
float* rating_limit,
WERD_CHOICE *raw_choice,
BOOL8 *any_alpha) {
BLOB_CHOICE *first_choice;
const char *first_char; //first choice
const char *second_char; //second choice
const char *third_char; //third choice
char prev_char[UNICHAR_LEN + 1]; //prev in word
const char *next_char = ""; //next in word
const char *next_next_char = ""; //after next next in word
// Three candidate words are built in parallel: the plain top choices,
// an upper-case variant and a lower-case variant.
WERD_CHOICE word(MAX_PERM_LENGTH);
word.set_permuter(TOP_CHOICE_PERM);
WERD_CHOICE capital_word(MAX_PERM_LENGTH);
capital_word.set_permuter(UPPER_CASE_PERM);
WERD_CHOICE lower_word(MAX_PERM_LENGTH);
lower_word.set_permuter(LOWER_CASE_PERM);
int x;
BOOL8 char_alpha;
float first_rating = 0;
float adjust_factor;
// Per-position certainties of each candidate, passed to LogNewChoice().
float certainties[MAX_PERM_LENGTH + 1];
float lower_certainties[MAX_PERM_LENGTH + 1];
float upper_certainties[MAX_PERM_LENGTH + 1];
BLOB_CHOICE_IT blob_choice_it;
UNICHAR_ID temp_id;
UNICHAR_ID unichar_id;
UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
register const char* ch;
register inT8 lower_done;
register inT8 upper_done;
prev_char[0] = '\0';
if (any_alpha != NULL)
*any_alpha = FALSE;
// Too long for the fixed-size certainty arrays above.
if (char_choices.length() > MAX_PERM_LENGTH) {
return (NULL);
}
for (x = 0; x < char_choices.length(); ++x) {
// Look ahead one and two positions; choose_il1() uses the neighbours
// as context.
if (x + 1 < char_choices.length()) {
unichar_id = get_top_choice_uid(char_choices.get(x+1));
next_char = unichar_id != INVALID_UNICHAR_ID ?
getUnicharset().id_to_unichar(unichar_id) : "";
} else {
next_char = "";
}
if (x + 2 < char_choices.length()) {
unichar_id = get_top_choice_uid(char_choices.get(x+2));
next_next_char = unichar_id != INVALID_UNICHAR_ID ?
getUnicharset().id_to_unichar(unichar_id) : "";
} else {
next_next_char = "";
}
blob_choice_it.set_to_list(char_choices.get(x));
ASSERT_HOST(!blob_choice_it.empty());
first_choice = NULL;
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) { // find the best non-fragment char choice
temp_id = blob_choice_it.data()->unichar_id();
if (!(getUnicharset().get_fragment(temp_id))) {
first_choice = blob_choice_it.data();
break;
} else if (char_choices.length() > 1) {
// A fragment outranks every whole character here: flag all three
// candidates so the caller can run the fragment permuter.
word.set_fragment_mark(true);
capital_word.set_fragment_mark(true);
lower_word.set_fragment_mark(true);
}
}
if (first_choice == NULL) {
cprintf("Permuter found only fragments for"
" character at position %d; word=%s\n",
x, word.debug_string(getUnicharset()).string());
}
ASSERT_HOST(first_choice != NULL);
// An invalid top unichar id is represented by a space in the word.
unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
first_choice->unichar_id() : space;
first_char = getUnicharset().id_to_unichar(unichar_id);
first_rating = first_choice->rating();
// All three candidates start out with the top choice at this position.
word.append_unichar_id_space_allocated(
unichar_id, 1, first_choice->rating(), first_choice->certainty());
capital_word.append_unichar_id_space_allocated(
unichar_id, 1, first_choice->rating(), first_choice->certainty());
lower_word.append_unichar_id_space_allocated(
unichar_id, 1, first_choice->rating(), first_choice->certainty());
certainties[x] = first_choice->certainty();
lower_certainties[x] = first_choice->certainty();
upper_certainties[x] = first_choice->certainty();
lower_done = FALSE;
upper_done = FALSE;
char_alpha = FALSE;
second_char = "";
third_char = "";
// Continue from the first non-fragment choice (the iterator was left
// there by the loop above) through the remaining choices.
for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
unichar_id = blob_choice_it.data()->unichar_id();
// "l" followed by "1"/"I" with the same rating as the top choice:
// let choose_il1() disambiguate using the neighbouring characters.
if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
blob_choice_it.data_relative(1)->rating() == first_rating) {
temp_id = blob_choice_it.data_relative(1)->unichar_id();
if (getUnicharset().eq(temp_id, "1") ||
getUnicharset().eq(temp_id, "I")) {
second_char = getUnicharset().id_to_unichar(temp_id);
blob_choice_it.forward();
if (!blob_choice_it.at_last() &&
blob_choice_it.data_relative(1)->rating() == first_rating) {
temp_id = blob_choice_it.data_relative(1)->unichar_id();
if (getUnicharset().eq(temp_id, "1") ||
getUnicharset().eq(temp_id, "I")) {
third_char = getUnicharset().id_to_unichar(temp_id);
blob_choice_it.forward();
}
}
ch = choose_il1 (first_char, second_char, third_char,
prev_char, next_char, next_next_char);
unichar_id = (ch != NULL && *ch != '\0') ?
getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
// NOTE(review): ch is tested for NULL on the line above but is passed
// to strcmp() unguarded here - confirm choose_il1 never returns NULL.
if (strcmp(ch, "l") != 0 &&
getUnicharset().eq(word.unichar_id(x), "l")) {
word.set_unichar_id(unichar_id, x);
lower_word.set_unichar_id(unichar_id, x);
capital_word.set_unichar_id(unichar_id, x);
}
}
}
if (unichar_id != INVALID_UNICHAR_ID) {
/* Find lower case */
// At the first position an upper-case choice is also accepted for the
// lower-case word.
if (!lower_done &&
(getUnicharset().get_islower(unichar_id) ||
(getUnicharset().get_isupper(unichar_id) && x == 0))) {
lower_word.set_unichar_id(unichar_id, x);
// Swap the top choice's rating contribution for this choice's.
lower_word.set_rating(lower_word.rating() -
first_choice->rating() + blob_choice_it.data()->rating());
if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
lower_word.set_certainty(blob_choice_it.data()->certainty());
}
lower_certainties[x] = blob_choice_it.data()->certainty();
lower_done = TRUE;
}
/* Find upper case */
if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
capital_word.set_unichar_id(unichar_id, x);
// Swap the top choice's rating contribution for this choice's.
capital_word.set_rating(capital_word.rating() -
first_choice->rating() + blob_choice_it.data()->rating());
if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
capital_word.set_certainty(blob_choice_it.data()->certainty());
}
upper_certainties[x] = blob_choice_it.data()->certainty();
upper_done = TRUE;
}
if (!char_alpha) {
// For a fragment, test the character the fragment belongs to.
const CHAR_FRAGMENT *fragment =
getUnicharset().get_fragment(unichar_id);
temp_id = !fragment ? unichar_id :
getUnicharset().unichar_to_id(fragment->get_unichar());
if (getUnicharset().get_isalpha(temp_id)) {
char_alpha = TRUE;
}
}
// Both case variants settled for this position: stop scanning.
if (lower_done && upper_done)
break;
}
}
if (char_alpha && any_alpha != NULL)
*any_alpha = TRUE;
// Prune: give up as soon as the partial word is already too expensive.
if (word.rating() > bestrate_pruning_factor * *rating_limit) {
if (permute_debug)
tprintf("\n***** Aborting high-cost word: %g > limit %g \n",
word.rating(), bestrate_pruning_factor * *rating_limit);
return (NULL);
}
// Remember the character just appended as context for the next position.
*prev_char = '\0';
temp_id = word.unichar_id(word.length()-1);
if (temp_id != INVALID_UNICHAR_ID) {
strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
}
}
// The unadjusted top-choice word becomes the raw choice if it is better.
if (word.rating() < raw_choice->rating()) {
*raw_choice = word;
LogNewChoice(*raw_choice, 1.0, certainties, true);
}
if (ngram_permuter_activated)
return NULL;
// Keep each pre-adjustment rating so it can be reported back through
// *rating_limit, then apply the non-word penalty to each candidate.
float rating = word.rating();
adjust_non_word(&word, &adjust_factor);
LogNewChoice(word, adjust_factor, certainties, false);
float lower_rating = lower_word.rating();
adjust_non_word(&lower_word, &adjust_factor);
LogNewChoice(lower_word, adjust_factor, lower_certainties, false);
float upper_rating = capital_word.rating();
adjust_non_word(&capital_word, &adjust_factor);
LogNewChoice(capital_word, adjust_factor, upper_certainties, false);
// Pick the candidate with the lowest adjusted rating; on ties the earlier
// candidate (word, then lower_word) is kept.
WERD_CHOICE *best_choice = &word;
*rating_limit = rating;
if (lower_word.rating() < best_choice->rating()) {
best_choice = &lower_word;
*rating_limit = lower_rating;
}
if (capital_word.rating() < best_choice->rating()) {
best_choice = &capital_word;
*rating_limit = upper_rating;
}
// The candidates are locals, so hand back a heap copy owned by the caller.
return new WERD_CHOICE(*best_choice);
}
/**********************************************************************
 * choose_il1
 *
 * Choose between the candidate il1 chars for a single character position,
 * using the neighbouring characters as context.
 *
 * All arguments are NUL-terminated strings; an empty string means "no such
 * choice / no such neighbour". Only the first byte of each candidate is
 * inspected to tell 'l', 'I' and '1' apart.
 *
 * Returns:
 *  -- first_char unchanged unless it starts with 'l' and second_char is
 *     non-empty;
 *  -- second_char when it is 'I' and one neighbour is upper case while the
 *     other neighbour is absent or is neither lower case nor a digit;
 *  -- the literal "1" when '1' is the second or third choice and either
 *     a digit is adjacent (or follows a next 'l'), or no lower case letter
 *     is adjacent and the character is not a lone char next to a quote;
 *  -- first_char otherwise.
 * The returned pointer is either one of the arguments or a string literal;
 * nothing is allocated.
 **********************************************************************/
const char* Dict::choose_il1(const char *first_char,        //first choice
                             const char *second_char,       //second choice
                             const char *third_char,        //third choice
                             const char *prev_char,         //prev in word
                             const char *next_char,         //next in word
                             const char *next_next_char) {  //after next in word
  int prev_char_length = strlen(prev_char);
  int next_char_length = strlen(next_char);
  int next_next_char_length = strlen(next_next_char);

  // Nothing to arbitrate unless the top choice is 'l' and there is at least
  // one alternative.
  if (*first_char != 'l' || *second_char == '\0')
    return first_char;

  // 'I' wins when it sits next to an upper case letter and the neighbour on
  // the other side (if any) is neither lower case nor a digit.
  if (*second_char == 'I' &&
      (((prev_char_length != 0 &&
         getUnicharset().get_isupper(prev_char, prev_char_length)) &&
        (next_char_length == 0 ||
         (!getUnicharset().get_islower(next_char, next_char_length) &&
          !getUnicharset().get_isdigit(next_char, next_char_length)))) ||
       ((next_char_length != 0 &&
         getUnicharset().get_isupper(next_char, next_char_length)) &&
        (prev_char_length == 0 ||
         (!getUnicharset().get_islower(prev_char, prev_char_length) &&
          !getUnicharset().get_isdigit(prev_char, prev_char_length))))))
    return second_char;  // override

  // From here on the only possible override is '1', so it must be among
  // the alternatives.
  if (*second_char != '1' && *third_char != '1')
    return first_char;

  // A digit on either side, or a following 'l' that is itself followed by
  // a digit, makes this a '1'.
  if ((next_char_length != 0 &&
       getUnicharset().get_isdigit(next_char, next_char_length)) ||
      (prev_char_length != 0 &&
       getUnicharset().get_isdigit(prev_char, prev_char_length)) ||
      (*next_char == 'l' &&
       (next_next_char_length != 0 &&
        getUnicharset().get_isdigit(next_next_char,
                                    next_next_char_length))))
    return "1";

  // Otherwise accept '1' only when no lower case letter is adjacent. A next
  // 's' followed by 't' is tolerated (presumably for ordinals like "1st").
  if ((prev_char_length == 0 ||
       !getUnicharset().get_islower(prev_char, prev_char_length)) &&
      ((next_char_length == 0 ||
        !getUnicharset().get_islower(next_char, next_char_length)) ||
       (*next_char == 's' && *next_next_char == 't'))) {
    // ...but keep 'l' for a lone character whose only neighbour is a quote
    // or back-quote.
    if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0') &&
        ((*next_char != '\'' && *next_char != '`') || *prev_char != '\0'))
      return "1";
  }

  // The former bigram-based tie-break was compiled out (#if 0) and had no
  // effect on the result, so it has been removed.
  return first_char;
}
//
// Check all the DAWGs to see if this word is in any of them.
//
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
const WERD_CHOICE *word_ptr = &word;
WERD_CHOICE temp_word;
if (hyphenated()) {
copy_hyphen_info(&temp_word);
temp_word += word;
word_ptr = &temp_word;
}
if (word_ptr->length() == 0) return NO_PERM;
// Allocate vectors for holding current and updated
// active_dawgs and constraints and initialize them.
DawgInfoVector *active_dawgs = new DawgInfoVector[2];
DawgInfoVector *constraints = new DawgInfoVector[2];
init_active_dawgs(&(active_dawgs[0]));
init_constraints(&(constraints[0]));
DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
&(active_dawgs[1]), &(constraints[1]), 0.0);
int last_index = word_ptr->length() - 1;
// Call leter_is_okay for each letter in the word.
for (int i = hyphen_base_size(); i <= last_index; ++i) {
if (!((this->*letter_is_okay_)(&dawg_args, i, word_ptr,
i == last_index))) break;
// Swap active_dawgs, constraints with the corresponding updated vector.
if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) {
dawg_args.updated_active_dawgs = &(active_dawgs[0]);
dawg_args.updated_constraints = &(constraints[0]);
++(dawg_args.active_dawgs);
++(dawg_args.constraints);
} else {
++(dawg_args.updated_active_dawgs);
++(dawg_args.updated_constraints);
dawg_args.active_dawgs = &(active_dawgs[0]);
dawg_args.constraints = &(constraints[0]);
}
}
delete[] active_dawgs;
delete[] constraints;
if (dawg_args.permuter == SYSTEM_DAWG_PERM ||
dawg_args.permuter == DOC_DAWG_PERM ||
dawg_args.permuter == USER_DAWG_PERM ||
(numbers_ok && dawg_args.permuter == NUMBER_PERM)){
return dawg_args.permuter;
} else {
return NO_PERM;
}
}
//
// Return true if the word contains a valid punctuation pattern.
//
// The word is reduced to a pattern: punctuation unichars are kept as they
// are, and every run of alpha/digit unichars is collapsed into a single
// Dawg::kPatternUnicharID. The pattern is then looked up in every dawg of
// type DAWG_TYPE_PUNCTUATION. A unichar that is neither punctuation,
// alpha nor digit makes the word invalid, as does an empty word.
//
// Note: Since the domains of punctuation symbols and symbols
// used in numbers are not disjoint, a valid number might contain
// an invalid punctuation pattern (e.g. .99).
//
bool Dict::valid_punctuation(const WERD_CHOICE &word) {
  // This function returns bool, so report failure as false rather than
  // with the permuter code NO_PERM.
  if (word.length() == 0) return false;
  int i;
  WERD_CHOICE new_word;
  int last_index = word.length() - 1;
  for (i = 0; i <= last_index; ++i) {
    UNICHAR_ID unichar_id = word.unichar_id(i);
    if (getUnicharset().get_ispunctuation(unichar_id)) {
      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
    } else if (!getUnicharset().get_isalpha(unichar_id) &&
               !getUnicharset().get_isdigit(unichar_id)) {
      return false;  // neither punc, nor alpha, nor digit
    } else {
      // Alpha or digit: emit one pattern marker per run, not per character.
      int new_len = new_word.length();
      if (new_len == 0 ||
          new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
        new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
      }
    }
  }
  for (i = 0; i < dawgs_.size(); ++i) {
    if (dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
        dawgs_[i]->word_in_dawg(new_word)) return true;
  }
  return false;
}
/**********************************************************************
 * fragment_state_okay
 *
 * Given the current char choice and information about previously seen
 * fragments, determines whether adjacent character fragments are
 * present and whether they can be concatenated.
 *
 * The given prev_char_frag_info (may be NULL) contains:
 *  -- fragment: if not NULL contains information about the immediately
 *     preceding fragmented character choice
 *  -- num_fragments: number of fragments that have been used so far
 *     to construct a character
 *  -- certainty: certainty of the current choice or minimum
 *     certainty of all fragments concatenated so far
 *  -- rating: rating of the current choice or sum of fragment
 *     ratings concatenated so far
 *
 * The output char_frag_info is filled in as follows:
 *  -- unichar_id: is set to INVALID_UNICHAR_ID if the choice is a
 *     non-ending fragment piece; is set to the unichar id of the given
 *     choice if it is a regular character, or to the id of the character
 *     the fragments build if it is a matching ending fragment
 *  -- fragment,num_fragments,certainty,rating are set as described above
 *
 * If debug is not NULL it is used as a prefix for diagnostic output.
 *
 * Returns false if a non-matching fragment is discovered, if a regular
 * character follows an unfinished fragment, or if the word would end on
 * an unfinished fragment; true otherwise.
 **********************************************************************/
bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
                               float curr_rating, float curr_certainty,
                               const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                               const char *debug, int word_ending,
                               CHAR_FRAGMENT_INFO *char_frag_info) {
  const CHAR_FRAGMENT *this_fragment =
      getUnicharset().get_fragment(curr_unichar_id);
  const CHAR_FRAGMENT *prev_fragment = NULL;
  if (prev_char_frag_info != NULL) {
    prev_fragment = prev_char_frag_info->fragment;
  }

  // Print debug info for fragments.
  if (debug && (prev_fragment || this_fragment)) {
    cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
            getUnicharset().debug_str(curr_unichar_id).string(),
            word_ending);
    if (prev_fragment) {
      cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
    }
    if (this_fragment) {
      cprintf("this_fragment %s\n", this_fragment->to_string().string());
    }
  }

  // Start from the "stand-alone choice" description; the fragment cases
  // below overwrite the fields that differ.
  char_frag_info->unichar_id = curr_unichar_id;
  char_frag_info->fragment = this_fragment;
  char_frag_info->rating = curr_rating;
  char_frag_info->certainty = curr_certainty;
  char_frag_info->num_fragments = 1;

  if (!this_fragment) {
    // A regular character: fine unless it interrupts an unfinished fragment.
    if (prev_fragment) {
      if (debug) tprintf("Skip choice with incomplete fragment\n");
      return false;
    }
    return true;
  }

  // We are dealing with a fragment.
  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
  if (!prev_fragment) {
    // With nothing before it, the piece has to open a new character.
    if (!this_fragment->is_beginning()) {
      if (debug) {
        tprintf("Non-starting fragment piece with no prev_fragment\n");
      }
      return false;
    }
    if (debug) cprintf("Record fragment beginning\n");
  } else {
    if (!this_fragment->is_continuation_of(prev_fragment)) {
      if (debug) tprintf("Non-matching fragment piece\n");
      return false;
    }
    if (this_fragment->is_ending()) {
      // The last piece completes the character.
      char_frag_info->unichar_id =
          getUnicharset().unichar_to_id(this_fragment->get_unichar());
      char_frag_info->fragment = NULL;
      if (debug) {
        tprintf("Built character %s from fragments\n",
                getUnicharset().debug_str(
                    char_frag_info->unichar_id).string());
      }
    } else if (debug) {
      // fragment already points at this_fragment from the defaults above.
      tprintf("Record fragment continuation\n");
    }
    // Update certainty and rating: accumulate over all pieces so far.
    char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
    char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
    char_frag_info->certainty =
        MIN(curr_certainty, prev_char_frag_info->certainty);
  }

  if (word_ending && char_frag_info->fragment) {
    if (debug) tprintf("Word can not end with a fragment\n");
    return false;
  }
  return true;
}
/**********************************************************************
 * top_fragments_permute_and_select
 *
 * Creates a copy of the character choices list that contains only
 * fragments and the best non-fragmented character choice for each
 * position. Permutes the characters in this shortened list, building
 * characters from fragments where possible.
 *
 * Returns NULL if char_choices has at most one position or more than
 * MAX_PERM_LENGTH positions, or if the sum of the first-listed ratings
 * already reaches rating_limit. Otherwise returns a newly allocated
 * WERD_CHOICE, which is initialised with make_bad() and overwritten by
 * any better choice found during the permutation.
 **********************************************************************/
WERD_CHOICE *Dict::top_fragments_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices,
    float rating_limit) {
  if (char_choices.length() <= 1 ||
      char_choices.length() > MAX_PERM_LENGTH) {
    return NULL;
  }

  // See if it would be possible to benefit from permuting fragments: bail
  // out as soon as the running sum of first-listed ratings hits the limit.
  int pos;
  float rating_sum = 0.0;
  BLOB_CHOICE_IT src_it;
  for (pos = 0; pos < char_choices.length(); ++pos) {
    src_it.set_to_list(char_choices.get(pos));
    if (src_it.data()) {
      rating_sum += src_it.data()->rating();
    }
    if (rating_sum >= rating_limit) {
      return NULL;
    }
  }

  if (fragments_debug > 1) {
    tprintf("A choice with fragment beats top choice\n");
    tprintf("Running fragment permuter...\n");
  }

  // Construct a modified choices list that contains (for each position):
  // all fragments, plus the first non-fragmented choice encountered.
  BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
  for (pos = 0; pos < char_choices.length(); ++pos) {
    BLOB_CHOICE_LIST *kept_choices = new BLOB_CHOICE_LIST();
    BLOB_CHOICE_IT kept_it;
    kept_it.set_to_list(kept_choices);
    bool need_nonfrag_char = true;
    src_it.set_to_list(char_choices.get(pos));
    for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
      const bool is_fragment =
          getUnicharset().get_fragment(src_it.data()->unichar_id()) != NULL;
      if (!is_fragment && !need_nonfrag_char) {
        continue;  // already have a non-fragment choice for this position
      }
      kept_it.add_after_then_move(new BLOB_CHOICE(*(src_it.data())));
      if (!is_fragment) {
        need_nonfrag_char = false;
      }
    }
    frag_char_choices += kept_choices;
  }

  // Run the generic permuter over the reduced lists.
  WERD_CHOICE *best_choice = new WERD_CHOICE();
  best_choice->make_bad();
  WERD_CHOICE word(MAX_PERM_LENGTH);
  word.set_permuter(TOP_CHOICE_PERM);
  float certainties[MAX_PERM_LENGTH];
  const char *debug_tag = (fragments_debug > 1) ? "fragments_debug" : NULL;
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
  permute_choices(debug_tag, frag_char_choices, 0, NULL, &word, certainties,
                  &rating_limit, best_choice, NULL);

  // The copied lists are no longer needed once the permutation is done.
  frag_char_choices.delete_data_pointers();
  return best_choice;
}
/**********************************************************************
* permute_choices
*
* Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
* with the given char_choice_index in char_choices.
**********************************************************************/
void Dict::permute_choices(
const char *debug,
const BLOB_CHOICE_LIST_VECTOR &char_choices,
int char_choice_index,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
WERD_CHOICE *word,
float certainties[],
float *limit,
WERD_CHOICE *best_choice,
void *more_args) {
if (debug) {
tprintf("%s permute_choices: char_choice_index=%d"
" limit=%4.2f rating=%4.2f, certainty=%4.2f word=%s\n",
debug, char_choice_index, *limit, word->rating(),
word->certainty(), word->debug_string(getUnicharset()).string());
}
if (char_choice_index < char_choices.length()) {
BLOB_CHOICE_IT blob_choice_it;
blob_choice_it.set_to_list(char_choices.get(char_choice_index));
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) {
append_choices(debug, char_choices, *(blob_choice_it.data()),
char_choice_index, prev_char_frag_info, word,
certainties, limit, best_choice, more_args);
}
}
}
/**********************************************************************
* append_choices
*
* Check to see whether or not the next choice is worth appending to
* the word being generated. If so then keep going deeper into the word.
*
* This function assumes that Dict::go_deeper_fxn_ is set.
**********************************************************************/
void Dict::append_choices(
const char *debug,
const BLOB_CHOICE_LIST_VECTOR &char_choices,
const BLOB_CHOICE &blob_choice,
int char_choice_index,
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
WERD_CHOICE *word,
float certainties[],
float *limit,
WERD_CHOICE *best_choice,
void *more_args) {
int word_ending =
(char_choice_index == char_choices.length() - 1) ? true : false;
// Deal with fragments.
CHAR_FRAGMENT_INFO char_frag_info;
if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
blob_choice.certainty(), prev_char_frag_info, debug,
word_ending, &char_frag_info)) {
return; // blob_choice must be an invalid fragment
}
// Search the next letter if this character is a fragment.
if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
permute_choices(debug, char_choices, char_choice_index + 1,
&char_frag_info, word, certainties, limit,
best_choice, more_args);
return;
}
// Add the next unichar.
float old_rating = word->rating();
float old_certainty = word->certainty();
uinT8 old_permuter = word->permuter();
certainties[word->length()] = char_frag_info.certainty;
word->append_unichar_id_space_allocated(
char_frag_info.unichar_id, char_frag_info.num_fragments,
char_frag_info.rating, char_frag_info.certainty);
// Explore the next unichar.
(this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
&char_frag_info, word_ending, word, certainties,
limit, best_choice, more_args);
// Remove the unichar we added to explore other choices in it's place.
word->remove_last_unichar_id();
word->set_rating(old_rating);
word->set_certainty(old_certainty);
word->set_permuter(old_permuter);
}
/**********************************************************************
 * go_deeper_top_fragments_fxn
 *
 * If the choice being composed so far could be better
 * than best_choice keep exploring choices.
 *
 * A word whose rating is not below *limit is pruned. A complete word
 * below the limit lowers *limit to its rating, is adjusted with
 * adjust_non_word(), logged, and copied into best_choice if its adjusted
 * rating is lower. An incomplete word below the limit is extended via
 * permute_choices() at the next position.
 **********************************************************************/
void Dict::go_deeper_top_fragments_fxn(
    const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
    int char_choice_index,
    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    bool word_ending, WERD_CHOICE *word, float certainties[],
    float *limit, WERD_CHOICE *best_choice, void *more_args) {
  // Prune anything that cannot beat the current limit.
  if (!(word->rating() < *limit)) {
    if (fragments_debug > 1) {
      tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
              word->debug_string(getUnicharset()).string(),
              word->rating(), *limit);
    }
    return;
  }

  if (!word_ending) {
    // search the next letter
    permute_choices(debug, char_choices, char_choice_index + 1,
                    prev_char_frag_info, word, certainties, limit,
                    best_choice, more_args);
    return;
  }

  // A complete candidate word.
  if (fragments_debug > 1) {
    tprintf("fragments_debug new choice = %s\n",
            word->debug_string(getUnicharset()).string());
  }
  // The limit is tightened to the rating before adjust_non_word() runs.
  *limit = word->rating();
  float adjust_factor;
  adjust_non_word(word, &adjust_factor);
  LogNewChoice(*word, adjust_factor, certainties, false);
  if (word->rating() < best_choice->rating()) {
    *best_choice = *word;
  }
}
} // namespace tesseract