Abolish populate_unichars(), fixing seg fault reported in Debian:

http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658634



git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@675 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
david.eger@gmail.com 2012-02-15 01:37:00 +00:00
parent 53d133d83a
commit 018f192fc2
15 changed files with 15 additions and 97 deletions

View File

@ -249,7 +249,6 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
word_choice->append_unichar_id(bc->unichar_id(), 1,
bc->rating(), bc->certainty());
}
word_choice->populate_unichars();
}
// Tests the chopper by exhaustively running chop_one_blob.
@ -776,7 +775,6 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
}
choice->populate_unichars();
if (word_res->best_choice != NULL)
delete word_res->best_choice;
word_res->best_choice = choice;

View File

@ -420,7 +420,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
WERD_RES *w_prev = NULL;
WERD_RES *w = word_it.word();
if (w && w->best_choice) w->best_choice->populate_unichars();
while (1) {
w_prev = w;
while (word_it.forward() != NULL &&
@ -429,8 +428,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
}
if (!word_it.word()) break;
w = word_it.word();
if (w && w->best_choice)
w->best_choice->populate_unichars();
if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
continue;
}
@ -490,11 +487,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
}
if (tessedit_bigram_debug > 1) {
if (w_prev->alt_choices.size() > 1) {
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices,
false);
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
}
if (w->alt_choices.size() > 1) {
print_word_alternates_list(w->best_choice, &w->alt_choices, false);
print_word_alternates_list(w->best_choice, &w->alt_choices);
}
}
float best_rating = 0.0;
@ -1244,7 +1240,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
} else {
// Just correct existing classification.
CorrectRepcharChoices(best_choice, word_res);
word_res->best_choice->populate_unichars();
word_res->reject_map.initialise(word.length());
}
}

View File

@ -187,7 +187,6 @@ static WERD_CHOICE *create_werd_choice(
// Add list to the clist
blob_choices_it.add_to_end(choices_list);
}
werd_choice->populate_unichars();
werd_choice->set_certainty(certainty);
werd_choice->set_blob_choices(blob_choices);
return werd_choice;

View File

@ -684,9 +684,6 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
word_res->reject_map[i].setrej_unlv_rej ();
}
}
if (modified) {
word_res->best_choice->populate_unichars();
}
}
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {

View File

@ -248,7 +248,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
}
word->best_choice->populate_unichars();
word->reject_map.remove_pos (0);
word->box_word->DeleteBox(0);
}

View File

@ -847,9 +847,6 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
}
prev_right = out_box.right();
}
if (modified) {
best_choice->populate_unichars();
}
}
// Note: After running this function word_res->best_choice->blob_choices()
@ -975,9 +972,6 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
}
}
}
if (modified) {
best_choice->populate_unichars();
}
}
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {

View File

@ -130,7 +130,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
word->raw_choice->append_unichar_id(space_id, 1, 0.0,
word->raw_choice->certainty());
}
word->raw_choice->populate_unichars();
}
// Do sanity checks and minor fixes on best_choice.
@ -164,7 +163,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
word->best_choice->append_unichar_id(space_id, 1, 0.0,
word->best_choice->certainty());
}
word->best_choice->populate_unichars();
}
}

View File

@ -563,7 +563,6 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
bc_it.add_after_then_move(choice_list);
}
best_choice->set_blob_choices(word_choices);
best_choice->populate_unichars();
delete raw_choice;
raw_choice = new WERD_CHOICE(*best_choice);
reject_map.initialise(blob_count);
@ -633,10 +632,6 @@ bool WERD_RES::ConditionalBlobMerge(
}
delete class_cb;
delete box_cb;
if (modified) {
best_choice->populate_unichars();
raw_choice->populate_unichars();
}
return modified;
}

View File

@ -223,8 +223,6 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
* reverse_and_mirror_unichar_ids
*
* Reverses and mirrors unichars in unichar_ids.
* Note: this function does not change unichar_string_, it only modifies
* unichar_ids array.
*/
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
for (int i = 0; i < length_/2; ++i) {
@ -358,8 +356,6 @@ WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
second.permuter() != permuter_) {
permuter_ = COMPOUND_PERM;
}
unichar_string_ += second.unichar_string();
unichar_lengths_ += second.unichar_lengths();
// Append a deep copy of second blob_choices if it exists.
if (second.blob_choices_ != NULL) {
@ -412,8 +408,6 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
certainty_ = source.certainty();
permuter_ = source.permuter();
fragment_mark_ = source.fragment_mark();
unichar_string_ = source.unichar_string();
unichar_lengths_ = source.unichar_lengths();
// Delete existing blob_choices
this->delete_blob_choices();
@ -633,15 +627,8 @@ void print_char_choices_list(const char *msg,
*/
void print_word_alternates_list(
WERD_CHOICE *word,
GenericVector<WERD_CHOICE *> *alternates,
bool needs_populate_unichars) {
GenericVector<WERD_CHOICE *> *alternates) {
if (!word || !alternates) return;
if (needs_populate_unichars) {
word->populate_unichars();
for (int i = 0; i < alternates->size(); ++i) {
alternates->get(i)->populate_unichars();
}
}
STRING alternates_str;
for (int i = 0; i < alternates->size(); i++) {

View File

@ -297,8 +297,6 @@ class WERD_CHOICE {
fragment_mark_ = false;
blob_choices_ = NULL;
unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
unichar_string_ = "";
unichar_lengths_ = "";
}
/// Helper function to build a WERD_CHOICE from the given string,
@ -316,8 +314,6 @@ class WERD_CHOICE {
rating_ = kBadRating;
certainty_ = -MAX_FLOAT32;
fragment_mark_ = false;
unichar_string_ = "";
unichar_lengths_ = "";
}
/// This function assumes that there is enough space reserved
@ -373,19 +369,6 @@ class WERD_CHOICE {
}
return word_str;
}
/// Since this function walks over the whole word to convert unichar ids
/// to unichars, it is best to call it once, e.g. after all changes to
/// unichar_ids_ in WERD_CHOICE are finished.
void populate_unichars() {
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
}
/// Undoes populate_unichars, so that unichar_string_ and unichar_lengths_
/// are empty.
void depopulate_unichars() {
unichar_string_ = "";
unichar_lengths_ = "";
}
// Call this to override the default (strict left to right graphemes)
// with the fact that some engine produces a "reading order" set of
@ -398,19 +381,17 @@ class WERD_CHOICE {
return unichars_in_script_order_;
}
/// This function should only be called if populate_unichars()
/// was called and WERD_CHOICE did not change since then.
// Returns a UTF-8 string equivalent to the current choice
// of UNICHAR IDs.
const STRING &unichar_string() const {
assert(unichar_string_.length() <= 0 ||
unichar_string_.length() >= length_); // sanity check
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
return unichar_string_;
}
/// This function should only be called if populate_unichars()
/// was called and WERD_CHOICE did not change since then.
// Returns the lengths, one byte each, representing the number of bytes
// required in the unichar_string for each UNICHAR_ID.
const STRING &unichar_lengths() const {
assert(unichar_lengths_.length() <= 0 ||
unichar_lengths_.length() == length_); // sanity check
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
return unichar_lengths_;
}
const void print() const { this->print(""); }
@ -441,10 +422,10 @@ class WERD_CHOICE {
// (for Arabic, that is right-to-left).
bool unichars_in_script_order_;
// The following variables are only populated by calling populate_unichars().
// They are not synchronized with the values in unichar_ids otherwise.
STRING unichar_string_;
STRING unichar_lengths_;
// The following variables are repopulated, and then returned by reference,
// each time unichar_string() or unichar_lengths() is called.
mutable STRING unichar_string_;
mutable STRING unichar_lengths_;
bool unichar_info_present;
@ -484,7 +465,6 @@ void print_char_choices_list(
);
void print_word_alternates_list(
WERD_CHOICE *word,
GenericVector<WERD_CHOICE *> *alternates,
bool needs_populate_unichars);
GenericVector<WERD_CHOICE *> *alternates);
#endif

View File

@ -59,7 +59,6 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
// Remove the last unichar id as it is a hyphen, and remove
// any unichar_string/lengths that are present.
hyphen_word_->remove_last_unichar_id();
hyphen_word_->depopulate_unichars();
hyphen_active_dawgs_ = active_dawgs;
hyphen_constraints_ = constraints;
}

View File

@ -464,7 +464,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
}
if (part_choice && step > 1) { // found lexicon match
part_choice->populate_unichars();
get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
if (permuter_state)
@ -496,7 +495,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
best_choice->rating(), match_score, adjusted_score);
best_choice->set_rating(adjusted_score);
}
best_choice->populate_unichars();
if (permute_debug)
tprintf("Found Best CJK word %f: %s\n",
best_choice->rating(), best_choice->unichar_string().string());
@ -649,7 +647,6 @@ WERD_CHOICE* Dict::permute_chartype_words(
// All permuter choices should go through adjust_non_word so the choice
// rating would be adjusted on the same scale.
adjust_non_word(current_word, certainties, permute_debug);
current_word->populate_unichars();
if (replaced) {
// Apply a reward multiplier on rating if a chartype permutation is made.
float rating = current_word->rating();
@ -748,7 +745,6 @@ WERD_CHOICE* Dict::permute_script_words(
// All permuter choices should go through adjust_non_word so the choice
// rating would be adjusted on the same scale.
adjust_non_word(current_word, certainties, permute_debug);
current_word->populate_unichars();
if (replaced) {
// Apply a reward multiplier on rating if a script permutation is made.
float rating = current_word->rating();
@ -769,7 +765,6 @@ WERD_CHOICE* Dict::permute_script_words(
bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
WERD_CHOICE *best_choice,
WERD_CHOICE *raw_choice) {
float old_raw_choice_rating = raw_choice->rating();
if (permute_debug) {
tprintf("\n\n\n##### Permute_Characters #######\n");
print_char_choices_list("\n==> Input CharChoices", char_choices,
@ -781,18 +776,8 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
get_top_choice_uid(char_choices.get(0)) == 0) return false;
WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);
if (raw_choice->rating() < old_raw_choice_rating) {
// Populate unichars_ and unichar_lengths_ of raw_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (e.g. LearnWord).
raw_choice->populate_unichars();
}
if (this_choice && this_choice->rating() < best_choice->rating()) {
*best_choice = *this_choice;
// Populate unichars_ and unichar_lengths_ of best_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (dawg, *_ok functions, various hard-coded hacks).
best_choice->populate_unichars();
if (permute_debug) {
best_choice->print("\n**** Populate BestChoice");
@ -914,8 +899,6 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
current_word->debug_string().string(),
current_word->rating(), current_word->certainty());
}
current_word->populate_unichars();
EnableChoiceAccum();
}

View File

@ -703,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
} // end searching AmbigSpec_LIST
} // end searching best_choice
} // end searching replace and dangerous ambigs
if (modified_best_choice) best_choice->populate_unichars();
// If any ambiguities were found permute the constructed ambig_blob_choices
// to see if an alternative dictionary word can be found.
if (ambigs_found) {

View File

@ -190,9 +190,6 @@ void LanguageModel::InitForWord(
// Fill prev_word_str_ with the last language_model_ngram_order
// unichars from prev_word.
// Assume that populate_unichars() has been called on a valid prev_word,
// which is the case, since it points to the final result of the
// classification of the previous word.
if (language_model_ngram_on) {
if (prev_word != NULL && prev_word->unichar_string() != NULL) {
prev_word_str_ = prev_word->unichar_string();
@ -1234,7 +1231,6 @@ void LanguageModel::UpdateBestChoice(
dict_->LogNewChoice(1.0, certainties, true, word);
*(best_choice_bundle->raw_choice) = *word;
best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
best_choice_bundle->raw_choice->populate_unichars();
if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
}
@ -1278,7 +1274,6 @@ void LanguageModel::UpdateBestChoice(
}
// Update best_choice_bundle.
*(best_choice_bundle->best_choice) = *word;
best_choice_bundle->best_choice->populate_unichars();
best_choice_bundle->updated = true;
best_choice_bundle->best_char_choices->delete_data_pointers();
best_choice_bundle->best_char_choices->clear();

View File

@ -189,7 +189,6 @@ void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
alt_choice->set_rating(choice->Rating);
alt_choice->set_certainty(choice->Certainty);
word->alt_choices.push_back(alt_choice);
alt_choice->populate_unichars();
if (wordrec_debug_level > 0) {
tprintf("SaveAltChoices: %s %g\n",
alt_choice->unichar_string().string(), alt_choice->rating());