From 018f192fc22283bb652b85c792b5e01767c61730 Mon Sep 17 00:00:00 2001 From: "david.eger@gmail.com" Date: Wed, 15 Feb 2012 01:37:00 +0000 Subject: [PATCH] Abolish populate_unichars(), fixing seg fault reported in Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658634 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@675 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- ccmain/applybox.cpp | 2 -- ccmain/control.cpp | 9 ++------ ccmain/cube_control.cpp | 1 - ccmain/docqual.cpp | 3 --- ccmain/output.cpp | 1 - ccmain/reject.cpp | 6 ------ ccmain/tfacepp.cpp | 2 -- ccstruct/pageres.cpp | 5 ----- ccstruct/ratngs.cpp | 15 +------------- ccstruct/ratngs.h | 42 ++++++++++---------------------------- dict/hyphen.cpp | 1 - dict/permute.cpp | 17 --------------- dict/stopper.cpp | 2 +- wordrec/language_model.cpp | 5 ----- wordrec/wordrec.cpp | 1 - 15 files changed, 15 insertions(+), 97 deletions(-) diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 2d183eb3..4ac3d396 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -249,7 +249,6 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, word_choice->append_unichar_id(bc->unichar_id(), 1, bc->rating(), bc->certainty()); } - word_choice->populate_unichars(); } // Tests the chopper by exhaustively running chop_one_blob. @@ -776,7 +775,6 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f); } - choice->populate_unichars(); if (word_res->best_choice != NULL) delete word_res->best_choice; word_res->best_choice = choice; diff --git a/ccmain/control.cpp b/ccmain/control.cpp index a28b9a00..2901f4e7 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -420,7 +420,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { WERD_RES *w_prev = NULL; WERD_RES *w = word_it.word(); - if (w && w->best_choice) w->best_choice->populate_unichars(); while (1) { w_prev = w; while (word_it.forward() != NULL && @@ -429,8 +428,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { } if (!word_it.word()) break; w = word_it.word(); - if (w && w->best_choice) - w->best_choice->populate_unichars(); if (!w || !w_prev || w->uch_set != w_prev->uch_set) { continue; } @@ -490,11 +487,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { } if (tessedit_bigram_debug > 1) { if (w_prev->alt_choices.size() > 1) { - print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices, - false); + print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices); } if (w->alt_choices.size() > 1) { - print_word_alternates_list(w->best_choice, &w->alt_choices, false); + print_word_alternates_list(w->best_choice, &w->alt_choices); } } float best_rating = 0.0; @@ -1244,7 +1240,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { } else { // Just correct existing classification. CorrectRepcharChoices(best_choice, word_res); - word_res->best_choice->populate_unichars(); word_res->reject_map.initialise(word.length()); } } diff --git a/ccmain/cube_control.cpp b/ccmain/cube_control.cpp index cd5cc741..5b222a12 100644 --- a/ccmain/cube_control.cpp +++ b/ccmain/cube_control.cpp @@ -187,7 +187,6 @@ static WERD_CHOICE *create_werd_choice( // Add list to the clist blob_choices_it.add_to_end(choices_list); } - werd_choice->populate_unichars(); werd_choice->set_certainty(certainty); werd_choice->set_blob_choices(blob_choices); return werd_choice; diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp index c8fed20f..4d9ce4a3 100644 --- a/ccmain/docqual.cpp +++ b/ccmain/docqual.cpp @@ -684,9 +684,6 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { word_res->reject_map[i].setrej_unlv_rej (); } } - if (modified) { - word_res->best_choice->populate_unichars(); - } } GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { diff --git a/ccmain/output.cpp b/ccmain/output.cpp index 95ed7214..3081874e 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -248,7 +248,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices()); if (!blob_choices_it.empty()) delete blob_choices_it.extract(); } - word->best_choice->populate_unichars(); word->reject_map.remove_pos (0); word->box_word->DeleteBox(0); } diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp index 94497bfa..d9ef14c6 100644 --- a/ccmain/reject.cpp +++ b/ccmain/reject.cpp @@ -847,9 +847,6 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { } prev_right = out_box.right(); } - if (modified) { - best_choice->populate_unichars(); - } } // Note: After running this function word_res->best_choice->blob_choices() @@ -975,9 +972,6 @@ void Tesseract::flip_0O(WERD_RES *word_res) { } } } - if (modified) { - best_choice->populate_unichars(); - } } BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) { diff --git a/ccmain/tfacepp.cpp b/ccmain/tfacepp.cpp index d8459328..a3eeaaea 100644 --- a/ccmain/tfacepp.cpp +++ b/ccmain/tfacepp.cpp @@ -130,7 +130,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word, word->raw_choice->append_unichar_id(space_id, 1, 0.0, word->raw_choice->certainty()); } - word->raw_choice->populate_unichars(); } // Do sanity checks and minor fixes on best_choice. @@ -164,7 +163,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word, word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty()); } - word->best_choice->populate_unichars(); } } diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 1ce7c502..6696ce02 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -563,7 +563,6 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) { bc_it.add_after_then_move(choice_list); } best_choice->set_blob_choices(word_choices); - best_choice->populate_unichars(); delete raw_choice; raw_choice = new WERD_CHOICE(*best_choice); reject_map.initialise(blob_count); @@ -633,10 +632,6 @@ bool WERD_RES::ConditionalBlobMerge( } delete class_cb; delete box_cb; - if (modified) { - best_choice->populate_unichars(); - raw_choice->populate_unichars(); - } return modified; } diff --git a/ccstruct/ratngs.cpp b/ccstruct/ratngs.cpp index 02855851..0ba60ceb 100644 --- a/ccstruct/ratngs.cpp +++ b/ccstruct/ratngs.cpp @@ -223,8 +223,6 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) { * reverse_and_mirror_unichar_ids * * Reverses and mirrors unichars in unichar_ids. - * Note: this function does not change unichar_string_, it only modifies - * unichar_ids array. */ void WERD_CHOICE::reverse_and_mirror_unichar_ids() { for (int i = 0; i < length_/2; ++i) { @@ -358,8 +356,6 @@ WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) { second.permuter() != permuter_) { permuter_ = COMPOUND_PERM; } - unichar_string_ += second.unichar_string(); - unichar_lengths_ += second.unichar_lengths(); // Append a deep copy of second blob_choices if it exists. if (second.blob_choices_ != NULL) { @@ -412,8 +408,6 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) { certainty_ = source.certainty(); permuter_ = source.permuter(); fragment_mark_ = source.fragment_mark(); - unichar_string_ = source.unichar_string(); - unichar_lengths_ = source.unichar_lengths(); // Delete existing blob_choices this->delete_blob_choices(); @@ -633,15 +627,8 @@ void print_char_choices_list(const char *msg, */ void print_word_alternates_list( WERD_CHOICE *word, - GenericVector *alternates, - bool needs_populate_unichars) { + GenericVector *alternates) { if (!word || !alternates) return; - if (needs_populate_unichars) { - word->populate_unichars(); - for (int i = 0; i < alternates->size(); ++i) { - alternates->get(i)->populate_unichars(); - } - } STRING alternates_str; for (int i = 0; i < alternates->size(); i++) { diff --git a/ccstruct/ratngs.h b/ccstruct/ratngs.h index 4145aa0c..481847ec 100644 --- a/ccstruct/ratngs.h +++ b/ccstruct/ratngs.h @@ -297,8 +297,6 @@ class WERD_CHOICE { fragment_mark_ = false; blob_choices_ = NULL; unichars_in_script_order_ = false; // Tesseract is strict left-to-right. - unichar_string_ = ""; - unichar_lengths_ = ""; } /// Helper function to build a WERD_CHOICE from the given string, @@ -316,8 +314,6 @@ class WERD_CHOICE { rating_ = kBadRating; certainty_ = -MAX_FLOAT32; fragment_mark_ = false; - unichar_string_ = ""; - unichar_lengths_ = ""; } /// This function assumes that there is enough space reserved @@ -373,19 +369,6 @@ class WERD_CHOICE { } return word_str; } - /// Since this function walks over the whole word to convert unichar ids - /// to unichars, it is best to call it once, e.g. after all changes to - /// unichar_ids_ in WERD_CHOICE are finished. - void populate_unichars() { - this->string_and_lengths(&unichar_string_, &unichar_lengths_); - } - - /// Undoes populate_unichars, so that unichar_string_ and unichar_lengths_ - /// are empty. - void depopulate_unichars() { - unichar_string_ = ""; - unichar_lengths_ = ""; - } // Call this to override the default (strict left to right graphemes) // with the fact that some engine produces a "reading order" set of @@ -398,19 +381,17 @@ class WERD_CHOICE { return unichars_in_script_order_; } - /// This function should only be called if populate_unichars() - /// was called and WERD_CHOICE did not change since then. + // Returns a UTF-8 string equivalent to the current choice + // of UNICHAR IDs. const STRING &unichar_string() const { - assert(unichar_string_.length() <= 0 || - unichar_string_.length() >= length_); // sanity check + this->string_and_lengths(&unichar_string_, &unichar_lengths_); return unichar_string_; } - /// This function should only be called if populate_unichars() - /// was called and WERD_CHOICE did not change since then. + // Returns the lengths, one byte each, representing the number of bytes + // required in the unichar_string for each UNICHAR_ID. const STRING &unichar_lengths() const { - assert(unichar_lengths_.length() <= 0 || - unichar_lengths_.length() == length_); // sanity check + this->string_and_lengths(&unichar_string_, &unichar_lengths_); return unichar_lengths_; } const void print() const { this->print(""); } @@ -441,10 +422,10 @@ class WERD_CHOICE { // (for Arabic, that is right-to-left). bool unichars_in_script_order_; - // The following variables are only populated by calling populate_unichars(). - // They are not synchronized with the values in unichar_ids otherwise. - STRING unichar_string_; - STRING unichar_lengths_; + // The following variables are populated and passed by reference any + // time unichar_string() or unichar_lengths() are called. + mutable STRING unichar_string_; + mutable STRING unichar_lengths_; bool unichar_info_present; @@ -484,7 +465,6 @@ void print_char_choices_list( ); void print_word_alternates_list( WERD_CHOICE *word, - GenericVector *alternates, - bool needs_populate_unichars); + GenericVector *alternates); #endif diff --git a/dict/hyphen.cpp b/dict/hyphen.cpp index 1f39afdc..9f4296e8 100644 --- a/dict/hyphen.cpp +++ b/dict/hyphen.cpp @@ -59,7 +59,6 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word, // Remove the last unichar id as it is a hyphen, and remove // any unichar_string/lengths that are present. hyphen_word_->remove_last_unichar_id(); - hyphen_word_->depopulate_unichars(); hyphen_active_dawgs_ = active_dawgs; hyphen_constraints_ = constraints; } diff --git a/dict/permute.cpp b/dict/permute.cpp index e4111669..d0fc13f3 100644 --- a/dict/permute.cpp +++ b/dict/permute.cpp @@ -464,7 +464,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words( } if (part_choice && step > 1) { // found lexicon match - part_choice->populate_unichars(); get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr); float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length()); if (permuter_state) @@ -496,7 +495,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words( best_choice->rating(), match_score, adjusted_score); best_choice->set_rating(adjusted_score); } - best_choice->populate_unichars(); if (permute_debug) tprintf("Found Best CJK word %f: %s\n", best_choice->rating(), best_choice->unichar_string().string()); @@ -649,7 +647,6 @@ WERD_CHOICE* Dict::permute_chartype_words( // All permuter choices should go through adjust_non_word so the choice // rating would be adjusted on the same scale. adjust_non_word(current_word, certainties, permute_debug); - current_word->populate_unichars(); if (replaced) { // Apply a reward multiplier on rating if an chartype permutation is made. float rating = current_word->rating(); @@ -748,7 +745,6 @@ WERD_CHOICE* Dict::permute_script_words( // All permuter choices should go through adjust_non_word so the choice // rating would be adjusted on the same scale. adjust_non_word(current_word, certainties, permute_debug); - current_word->populate_unichars(); if (replaced) { // Apply a reward multiplier on rating if an script permutation is made. float rating = current_word->rating(); @@ -769,7 +765,6 @@ WERD_CHOICE* Dict::permute_script_words( bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice) { - float old_raw_choice_rating = raw_choice->rating(); if (permute_debug) { tprintf("\n\n\n##### Permute_Characters #######\n"); print_char_choices_list("\n==> Input CharChoices", char_choices, @@ -781,18 +776,8 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, get_top_choice_uid(char_choices.get(0)) == 0) return false; WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice); - if (raw_choice->rating() < old_raw_choice_rating) { - // Populate unichars_ and unichar_lengths_ of raw_choice. This is - // needed for various components that still work with unichars rather - // than unichar ids (e.g. LearnWord). - raw_choice->populate_unichars(); - } if (this_choice && this_choice->rating() < best_choice->rating()) { *best_choice = *this_choice; - // Populate unichars_ and unichar_lengths_ of best_choice. This is - // needed for various components that still work with unichars rather - // than unichar ids (dawg, *_ok functions, various hard-coded hacks). - best_choice->populate_unichars(); if (permute_debug) { best_choice->print("\n**** Populate BestChoice"); @@ -914,8 +899,6 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, current_word->debug_string().string(), current_word->rating(), current_word->certainty()); } - current_word->populate_unichars(); - EnableChoiceAccum(); } diff --git a/dict/stopper.cpp b/dict/stopper.cpp index 01d99f09..75a9657d 100644 --- a/dict/stopper.cpp +++ b/dict/stopper.cpp @@ -703,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, } // end searching AmbigSpec_LIST } // end searching best_choice } // end searching replace and dangerous ambigs - if (modified_best_choice) best_choice->populate_unichars(); + // If any ambiguities were found permute the constructed ambig_blob_choices // to see if an alternative dictionary word can be found. if (ambigs_found) { diff --git a/wordrec/language_model.cpp b/wordrec/language_model.cpp index 712100f3..d441dab2 100644 --- a/wordrec/language_model.cpp +++ b/wordrec/language_model.cpp @@ -190,9 +190,6 @@ void LanguageModel::InitForWord( // Fill prev_word_str_ with the last language_model_ngram_order // unichars from prev_word. - // Assume that populate_unichars() has been called on a valid prev_word, - // which is the case, since it points to the final result of the - // classification of the previous word. if (language_model_ngram_on) { if (prev_word != NULL && prev_word->unichar_string() != NULL) { prev_word_str_ = prev_word->unichar_string(); @@ -1234,7 +1231,6 @@ void LanguageModel::UpdateBestChoice( dict_->LogNewChoice(1.0, certainties, true, word); *(best_choice_bundle->raw_choice) = *word; best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM); - best_choice_bundle->raw_choice->populate_unichars(); if (language_model_debug_level > 0) tprintf("Updated raw choice\n"); } @@ -1278,7 +1274,6 @@ void LanguageModel::UpdateBestChoice( } // Update best_choice_bundle. *(best_choice_bundle->best_choice) = *word; - best_choice_bundle->best_choice->populate_unichars(); best_choice_bundle->updated = true; best_choice_bundle->best_char_choices->delete_data_pointers(); best_choice_bundle->best_char_choices->clear(); diff --git a/wordrec/wordrec.cpp b/wordrec/wordrec.cpp index e258e073..ec684984 100644 --- a/wordrec/wordrec.cpp +++ b/wordrec/wordrec.cpp @@ -189,7 +189,6 @@ void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) { alt_choice->set_rating(choice->Rating); alt_choice->set_certainty(choice->Certainty); word->alt_choices.push_back(alt_choice); - alt_choice->populate_unichars(); if (wordrec_debug_level > 0) { tprintf("SaveAltChoices: %s %g\n", alt_choice->unichar_string().string(), alt_choice->rating());