mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-04 01:39:16 +08:00
Abolish populate_unichars(), fixing seg fault reported in Debian:
http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658634

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@675 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent 53d133d83a
commit 018f192fc2
@ -249,7 +249,6 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
|
||||
word_choice->append_unichar_id(bc->unichar_id(), 1,
|
||||
bc->rating(), bc->certainty());
|
||||
}
|
||||
word_choice->populate_unichars();
|
||||
}
|
||||
|
||||
// Tests the chopper by exhaustively running chop_one_blob.
|
||||
@ -776,7 +775,6 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
||||
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
|
||||
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
|
||||
}
|
||||
choice->populate_unichars();
|
||||
if (word_res->best_choice != NULL)
|
||||
delete word_res->best_choice;
|
||||
word_res->best_choice = choice;
|
||||
|
@ -420,7 +420,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
|
||||
WERD_RES *w_prev = NULL;
|
||||
WERD_RES *w = word_it.word();
|
||||
if (w && w->best_choice) w->best_choice->populate_unichars();
|
||||
while (1) {
|
||||
w_prev = w;
|
||||
while (word_it.forward() != NULL &&
|
||||
@ -429,8 +428,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
}
|
||||
if (!word_it.word()) break;
|
||||
w = word_it.word();
|
||||
if (w && w->best_choice)
|
||||
w->best_choice->populate_unichars();
|
||||
if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
|
||||
continue;
|
||||
}
|
||||
@ -490,11 +487,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
}
|
||||
if (tessedit_bigram_debug > 1) {
|
||||
if (w_prev->alt_choices.size() > 1) {
|
||||
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices,
|
||||
false);
|
||||
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
|
||||
}
|
||||
if (w->alt_choices.size() > 1) {
|
||||
print_word_alternates_list(w->best_choice, &w->alt_choices, false);
|
||||
print_word_alternates_list(w->best_choice, &w->alt_choices);
|
||||
}
|
||||
}
|
||||
float best_rating = 0.0;
|
||||
@ -1244,7 +1240,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
|
||||
} else {
|
||||
// Just correct existing classification.
|
||||
CorrectRepcharChoices(best_choice, word_res);
|
||||
word_res->best_choice->populate_unichars();
|
||||
word_res->reject_map.initialise(word.length());
|
||||
}
|
||||
}
|
||||
|
@ -187,7 +187,6 @@ static WERD_CHOICE *create_werd_choice(
|
||||
// Add list to the clist
|
||||
blob_choices_it.add_to_end(choices_list);
|
||||
}
|
||||
werd_choice->populate_unichars();
|
||||
werd_choice->set_certainty(certainty);
|
||||
werd_choice->set_blob_choices(blob_choices);
|
||||
return werd_choice;
|
||||
|
@ -684,9 +684,6 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
|
||||
word_res->reject_map[i].setrej_unlv_rej ();
|
||||
}
|
||||
}
|
||||
if (modified) {
|
||||
word_res->best_choice->populate_unichars();
|
||||
}
|
||||
}
|
||||
|
||||
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
|
||||
|
@ -248,7 +248,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
|
||||
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
|
||||
}
|
||||
word->best_choice->populate_unichars();
|
||||
word->reject_map.remove_pos (0);
|
||||
word->box_word->DeleteBox(0);
|
||||
}
|
||||
|
@ -847,9 +847,6 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
}
|
||||
prev_right = out_box.right();
|
||||
}
|
||||
if (modified) {
|
||||
best_choice->populate_unichars();
|
||||
}
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->best_choice->blob_choices()
|
||||
@ -975,9 +972,6 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (modified) {
|
||||
best_choice->populate_unichars();
|
||||
}
|
||||
}
|
||||
|
||||
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
|
||||
|
@ -130,7 +130,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
word->raw_choice->append_unichar_id(space_id, 1, 0.0,
|
||||
word->raw_choice->certainty());
|
||||
}
|
||||
word->raw_choice->populate_unichars();
|
||||
}
|
||||
|
||||
// Do sanity checks and minor fixes on best_choice.
|
||||
@ -164,7 +163,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
word->best_choice->append_unichar_id(space_id, 1, 0.0,
|
||||
word->best_choice->certainty());
|
||||
}
|
||||
word->best_choice->populate_unichars();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -563,7 +563,6 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
|
||||
bc_it.add_after_then_move(choice_list);
|
||||
}
|
||||
best_choice->set_blob_choices(word_choices);
|
||||
best_choice->populate_unichars();
|
||||
delete raw_choice;
|
||||
raw_choice = new WERD_CHOICE(*best_choice);
|
||||
reject_map.initialise(blob_count);
|
||||
@ -633,10 +632,6 @@ bool WERD_RES::ConditionalBlobMerge(
|
||||
}
|
||||
delete class_cb;
|
||||
delete box_cb;
|
||||
if (modified) {
|
||||
best_choice->populate_unichars();
|
||||
raw_choice->populate_unichars();
|
||||
}
|
||||
return modified;
|
||||
}
|
||||
|
||||
|
@ -223,8 +223,6 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
||||
* reverse_and_mirror_unichar_ids
|
||||
*
|
||||
* Reverses and mirrors unichars in unichar_ids.
|
||||
* Note: this function does not change unichar_string_, it only modifies
|
||||
* unichar_ids array.
|
||||
*/
|
||||
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
|
||||
for (int i = 0; i < length_/2; ++i) {
|
||||
@ -358,8 +356,6 @@ WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
|
||||
second.permuter() != permuter_) {
|
||||
permuter_ = COMPOUND_PERM;
|
||||
}
|
||||
unichar_string_ += second.unichar_string();
|
||||
unichar_lengths_ += second.unichar_lengths();
|
||||
|
||||
// Append a deep copy of second blob_choices if it exists.
|
||||
if (second.blob_choices_ != NULL) {
|
||||
@ -412,8 +408,6 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
|
||||
certainty_ = source.certainty();
|
||||
permuter_ = source.permuter();
|
||||
fragment_mark_ = source.fragment_mark();
|
||||
unichar_string_ = source.unichar_string();
|
||||
unichar_lengths_ = source.unichar_lengths();
|
||||
|
||||
// Delete existing blob_choices
|
||||
this->delete_blob_choices();
|
||||
@ -633,15 +627,8 @@ void print_char_choices_list(const char *msg,
|
||||
*/
|
||||
void print_word_alternates_list(
|
||||
WERD_CHOICE *word,
|
||||
GenericVector<WERD_CHOICE *> *alternates,
|
||||
bool needs_populate_unichars) {
|
||||
GenericVector<WERD_CHOICE *> *alternates) {
|
||||
if (!word || !alternates) return;
|
||||
if (needs_populate_unichars) {
|
||||
word->populate_unichars();
|
||||
for (int i = 0; i < alternates->size(); ++i) {
|
||||
alternates->get(i)->populate_unichars();
|
||||
}
|
||||
}
|
||||
|
||||
STRING alternates_str;
|
||||
for (int i = 0; i < alternates->size(); i++) {
|
||||
|
@ -297,8 +297,6 @@ class WERD_CHOICE {
|
||||
fragment_mark_ = false;
|
||||
blob_choices_ = NULL;
|
||||
unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
|
||||
unichar_string_ = "";
|
||||
unichar_lengths_ = "";
|
||||
}
|
||||
|
||||
/// Helper function to build a WERD_CHOICE from the given string,
|
||||
@ -316,8 +314,6 @@ class WERD_CHOICE {
|
||||
rating_ = kBadRating;
|
||||
certainty_ = -MAX_FLOAT32;
|
||||
fragment_mark_ = false;
|
||||
unichar_string_ = "";
|
||||
unichar_lengths_ = "";
|
||||
}
|
||||
|
||||
/// This function assumes that there is enough space reserved
|
||||
@ -373,19 +369,6 @@ class WERD_CHOICE {
|
||||
}
|
||||
return word_str;
|
||||
}
|
||||
/// Since this function walks over the whole word to convert unichar ids
|
||||
/// to unichars, it is best to call it once, e.g. after all changes to
|
||||
/// unichar_ids_ in WERD_CHOICE are finished.
|
||||
void populate_unichars() {
|
||||
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
||||
}
|
||||
|
||||
/// Undoes populate_unichars, so that unichar_string_ and unichar_lengths_
|
||||
/// are empty.
|
||||
void depopulate_unichars() {
|
||||
unichar_string_ = "";
|
||||
unichar_lengths_ = "";
|
||||
}
|
||||
|
||||
// Call this to override the default (strict left to right graphemes)
|
||||
// with the fact that some engine produces a "reading order" set of
|
||||
@ -398,19 +381,17 @@ class WERD_CHOICE {
|
||||
return unichars_in_script_order_;
|
||||
}
|
||||
|
||||
/// This function should only be called if populate_unichars()
|
||||
/// was called and WERD_CHOICE did not change since then.
|
||||
// Returns a UTF-8 string equivalent to the current choice
|
||||
// of UNICHAR IDs.
|
||||
const STRING &unichar_string() const {
|
||||
assert(unichar_string_.length() <= 0 ||
|
||||
unichar_string_.length() >= length_); // sanity check
|
||||
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
||||
return unichar_string_;
|
||||
}
|
||||
|
||||
/// This function should only be called if populate_unichars()
|
||||
/// was called and WERD_CHOICE did not change since then.
|
||||
// Returns the lengths, one byte each, representing the number of bytes
|
||||
// required in the unichar_string for each UNICHAR_ID.
|
||||
const STRING &unichar_lengths() const {
|
||||
assert(unichar_lengths_.length() <= 0 ||
|
||||
unichar_lengths_.length() == length_); // sanity check
|
||||
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
||||
return unichar_lengths_;
|
||||
}
|
||||
const void print() const { this->print(""); }
|
||||
@ -441,10 +422,10 @@ class WERD_CHOICE {
|
||||
// (for Arabic, that is right-to-left).
|
||||
bool unichars_in_script_order_;
|
||||
|
||||
// The following variables are only populated by calling populate_unichars().
|
||||
// They are not synchronized with the values in unichar_ids otherwise.
|
||||
STRING unichar_string_;
|
||||
STRING unichar_lengths_;
|
||||
// The following variables are populated and passed by reference any
|
||||
// time unichar_string() or unichar_lengths() are called.
|
||||
mutable STRING unichar_string_;
|
||||
mutable STRING unichar_lengths_;
|
||||
|
||||
bool unichar_info_present;
|
||||
|
||||
@ -484,7 +465,6 @@ void print_char_choices_list(
|
||||
);
|
||||
void print_word_alternates_list(
|
||||
WERD_CHOICE *word,
|
||||
GenericVector<WERD_CHOICE *> *alternates,
|
||||
bool needs_populate_unichars);
|
||||
GenericVector<WERD_CHOICE *> *alternates);
|
||||
|
||||
#endif
|
||||
|
@ -59,7 +59,6 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
|
||||
// Remove the last unichar id as it is a hyphen, and remove
|
||||
// any unichar_string/lengths that are present.
|
||||
hyphen_word_->remove_last_unichar_id();
|
||||
hyphen_word_->depopulate_unichars();
|
||||
hyphen_active_dawgs_ = active_dawgs;
|
||||
hyphen_constraints_ = constraints;
|
||||
}
|
||||
|
@ -464,7 +464,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
|
||||
}
|
||||
|
||||
if (part_choice && step > 1) { // found lexicon match
|
||||
part_choice->populate_unichars();
|
||||
get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
|
||||
float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
|
||||
if (permuter_state)
|
||||
@ -496,7 +495,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
|
||||
best_choice->rating(), match_score, adjusted_score);
|
||||
best_choice->set_rating(adjusted_score);
|
||||
}
|
||||
best_choice->populate_unichars();
|
||||
if (permute_debug)
|
||||
tprintf("Found Best CJK word %f: %s\n",
|
||||
best_choice->rating(), best_choice->unichar_string().string());
|
||||
@ -649,7 +647,6 @@ WERD_CHOICE* Dict::permute_chartype_words(
|
||||
// All permuter choices should go through adjust_non_word so the choice
|
||||
// rating would be adjusted on the same scale.
|
||||
adjust_non_word(current_word, certainties, permute_debug);
|
||||
current_word->populate_unichars();
|
||||
if (replaced) {
|
||||
// Apply a reward multiplier on rating if an chartype permutation is made.
|
||||
float rating = current_word->rating();
|
||||
@ -748,7 +745,6 @@ WERD_CHOICE* Dict::permute_script_words(
|
||||
// All permuter choices should go through adjust_non_word so the choice
|
||||
// rating would be adjusted on the same scale.
|
||||
adjust_non_word(current_word, certainties, permute_debug);
|
||||
current_word->populate_unichars();
|
||||
if (replaced) {
|
||||
// Apply a reward multiplier on rating if an script permutation is made.
|
||||
float rating = current_word->rating();
|
||||
@ -769,7 +765,6 @@ WERD_CHOICE* Dict::permute_script_words(
|
||||
bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
WERD_CHOICE *best_choice,
|
||||
WERD_CHOICE *raw_choice) {
|
||||
float old_raw_choice_rating = raw_choice->rating();
|
||||
if (permute_debug) {
|
||||
tprintf("\n\n\n##### Permute_Characters #######\n");
|
||||
print_char_choices_list("\n==> Input CharChoices", char_choices,
|
||||
@ -781,18 +776,8 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
get_top_choice_uid(char_choices.get(0)) == 0) return false;
|
||||
WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);
|
||||
|
||||
if (raw_choice->rating() < old_raw_choice_rating) {
|
||||
// Populate unichars_ and unichar_lengths_ of raw_choice. This is
|
||||
// needed for various components that still work with unichars rather
|
||||
// than unichar ids (e.g. LearnWord).
|
||||
raw_choice->populate_unichars();
|
||||
}
|
||||
if (this_choice && this_choice->rating() < best_choice->rating()) {
|
||||
*best_choice = *this_choice;
|
||||
// Populate unichars_ and unichar_lengths_ of best_choice. This is
|
||||
// needed for various components that still work with unichars rather
|
||||
// than unichar ids (dawg, *_ok functions, various hard-coded hacks).
|
||||
best_choice->populate_unichars();
|
||||
|
||||
if (permute_debug) {
|
||||
best_choice->print("\n**** Populate BestChoice");
|
||||
@ -914,8 +899,6 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
current_word->debug_string().string(),
|
||||
current_word->rating(), current_word->certainty());
|
||||
}
|
||||
current_word->populate_unichars();
|
||||
|
||||
EnableChoiceAccum();
|
||||
}
|
||||
|
||||
|
@ -703,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
||||
} // end searching AmbigSpec_LIST
|
||||
} // end searching best_choice
|
||||
} // end searching replace and dangerous ambigs
|
||||
if (modified_best_choice) best_choice->populate_unichars();
|
||||
|
||||
// If any ambiguities were found permute the constructed ambig_blob_choices
|
||||
// to see if an alternative dictionary word can be found.
|
||||
if (ambigs_found) {
|
||||
|
@ -190,9 +190,6 @@ void LanguageModel::InitForWord(
|
||||
|
||||
// Fill prev_word_str_ with the last language_model_ngram_order
|
||||
// unichars from prev_word.
|
||||
// Assume that populate_unichars() has been called on a valid prev_word,
|
||||
// which is the case, since it points to the final result of the
|
||||
// classification of the previous word.
|
||||
if (language_model_ngram_on) {
|
||||
if (prev_word != NULL && prev_word->unichar_string() != NULL) {
|
||||
prev_word_str_ = prev_word->unichar_string();
|
||||
@ -1234,7 +1231,6 @@ void LanguageModel::UpdateBestChoice(
|
||||
dict_->LogNewChoice(1.0, certainties, true, word);
|
||||
*(best_choice_bundle->raw_choice) = *word;
|
||||
best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
|
||||
best_choice_bundle->raw_choice->populate_unichars();
|
||||
if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
|
||||
}
|
||||
|
||||
@ -1278,7 +1274,6 @@ void LanguageModel::UpdateBestChoice(
|
||||
}
|
||||
// Update best_choice_bundle.
|
||||
*(best_choice_bundle->best_choice) = *word;
|
||||
best_choice_bundle->best_choice->populate_unichars();
|
||||
best_choice_bundle->updated = true;
|
||||
best_choice_bundle->best_char_choices->delete_data_pointers();
|
||||
best_choice_bundle->best_char_choices->clear();
|
||||
|
@ -189,7 +189,6 @@ void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
|
||||
alt_choice->set_rating(choice->Rating);
|
||||
alt_choice->set_certainty(choice->Certainty);
|
||||
word->alt_choices.push_back(alt_choice);
|
||||
alt_choice->populate_unichars();
|
||||
if (wordrec_debug_level > 0) {
|
||||
tprintf("SaveAltChoices: %s %g\n",
|
||||
alt_choice->unichar_string().string(), alt_choice->rating());
|
||||
|
Loading…
Reference in New Issue
Block a user