Abolish populate_unichars(), fixing seg fault reported in Debian:

http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658634

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@675 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2024-12-04 01:39:16 +08:00 · 2012-02-15 01:37:00 +00:00 · 2012-02-15 01:37:00 +00:00 · 018f192fc2
commit 018f192fc2
parent 53d133d83a
15 changed files with 15 additions and 97 deletions
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@ -249,7 +249,6 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
    word_choice->append_unichar_id(bc->unichar_id(), 1,
                                   bc->rating(), bc->certainty());
  }
-  word_choice->populate_unichars();
 }

 // Tests the chopper by exhaustively running chop_one_blob.
@ -776,7 +775,6 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
      choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
    }
-    choice->populate_unichars();
    if (word_res->best_choice != NULL)
      delete word_res->best_choice;
    word_res->best_choice = choice;
--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@ -420,7 +420,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {

  WERD_RES *w_prev = NULL;
  WERD_RES *w = word_it.word();
-  if (w && w->best_choice) w->best_choice->populate_unichars();
  while (1) {
    w_prev = w;
    while (word_it.forward() != NULL &&
@ -429,8 +428,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
    }
    if (!word_it.word()) break;
    w = word_it.word();
-    if (w && w->best_choice)
-      w->best_choice->populate_unichars();
    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
      continue;
    }
@ -490,11 +487,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
    }
    if (tessedit_bigram_debug > 1) {
      if (w_prev->alt_choices.size() > 1) {
-        print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices,
-                                   false);
+        print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
      }
      if (w->alt_choices.size() > 1) {
-        print_word_alternates_list(w->best_choice, &w->alt_choices, false);
+        print_word_alternates_list(w->best_choice, &w->alt_choices);
      }
    }
    float best_rating = 0.0;
@ -1244,7 +1240,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
  } else {
    // Just correct existing classification.
    CorrectRepcharChoices(best_choice, word_res);
-    word_res->best_choice->populate_unichars();
    word_res->reject_map.initialise(word.length());
  }
 }
--- a/ccmain/cube_control.cpp
+++ b/ccmain/cube_control.cpp
@ -187,7 +187,6 @@ static WERD_CHOICE *create_werd_choice(
    // Add list to the clist
    blob_choices_it.add_to_end(choices_list);
  }
-  werd_choice->populate_unichars();
  werd_choice->set_certainty(certainty);
  werd_choice->set_blob_choices(blob_choices);
  return werd_choice;
--- a/ccmain/docqual.cpp
+++ b/ccmain/docqual.cpp
@ -684,9 +684,6 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
        word_res->reject_map[i].setrej_unlv_rej ();
    }
  }
-  if (modified) {
-    word_res->best_choice->populate_unichars();
-  }
 }

 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
--- a/ccmain/output.cpp
+++ b/ccmain/output.cpp
@ -248,7 +248,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
-    word->best_choice->populate_unichars();
    word->reject_map.remove_pos (0);
    word->box_word->DeleteBox(0);
  }
--- a/ccmain/reject.cpp
+++ b/ccmain/reject.cpp
@ -847,9 +847,6 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
    }
    prev_right = out_box.right();
  }
-  if (modified) {
-    best_choice->populate_unichars();
-  }
 }

 // Note: After running this function word_res->best_choice->blob_choices()
@ -975,9 +972,6 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
      }
    }
  }
-  if (modified) {
-    best_choice->populate_unichars();
-  }
 }

 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
--- a/ccmain/tfacepp.cpp
+++ b/ccmain/tfacepp.cpp
@ -130,7 +130,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
      word->raw_choice->append_unichar_id(space_id, 1, 0.0,
                                          word->raw_choice->certainty());
    }
-    word->raw_choice->populate_unichars();
  }

  // Do sanity checks and minor fixes on best_choice.
@ -164,7 +163,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
      word->best_choice->append_unichar_id(space_id, 1, 0.0,
                                           word->best_choice->certainty());
    }
-    word->best_choice->populate_unichars();
  }
 }

--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@ -563,7 +563,6 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
    bc_it.add_after_then_move(choice_list);
  }
  best_choice->set_blob_choices(word_choices);
-  best_choice->populate_unichars();
  delete raw_choice;
  raw_choice = new WERD_CHOICE(*best_choice);
  reject_map.initialise(blob_count);
@ -633,10 +632,6 @@ bool WERD_RES::ConditionalBlobMerge(
  }
  delete class_cb;
  delete box_cb;
-  if (modified) {
-    best_choice->populate_unichars();
-    raw_choice->populate_unichars();
-  }
  return modified;
 }

--- a/ccstruct/ratngs.cpp
+++ b/ccstruct/ratngs.cpp
@ -223,8 +223,6 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
 * reverse_and_mirror_unichar_ids
 *
 * Reverses and mirrors unichars in unichar_ids.
- * Note: this function does not change unichar_string_, it only modifies
- * unichar_ids array.
 */
 void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
  for (int i = 0; i < length_/2; ++i) {
@ -358,8 +356,6 @@ WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
             second.permuter() != permuter_) {
    permuter_ = COMPOUND_PERM;
  }
-  unichar_string_ += second.unichar_string();
-  unichar_lengths_ += second.unichar_lengths();

  // Append a deep copy of second blob_choices if it exists.
  if (second.blob_choices_ != NULL) {
@ -412,8 +408,6 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
  certainty_ = source.certainty();
  permuter_ = source.permuter();
  fragment_mark_ = source.fragment_mark();
-  unichar_string_ = source.unichar_string();
-  unichar_lengths_ = source.unichar_lengths();

  // Delete existing blob_choices
  this->delete_blob_choices();
@ -633,15 +627,8 @@ void print_char_choices_list(const char *msg,
 */
 void print_word_alternates_list(
    WERD_CHOICE *word,
-    GenericVector<WERD_CHOICE *> *alternates,
-    bool needs_populate_unichars) {
+    GenericVector<WERD_CHOICE *> *alternates) {
  if (!word || !alternates) return;
-  if (needs_populate_unichars) {
-    word->populate_unichars();
-    for (int i = 0; i < alternates->size(); ++i) {
-      alternates->get(i)->populate_unichars();
-    }
-  }

  STRING alternates_str;
  for (int i = 0; i < alternates->size(); i++) {
--- a/ccstruct/ratngs.h
+++ b/ccstruct/ratngs.h
@ -297,8 +297,6 @@ class WERD_CHOICE {
    fragment_mark_ = false;
    blob_choices_ = NULL;
    unichars_in_script_order_ = false;  // Tesseract is strict left-to-right.
-    unichar_string_ = "";
-    unichar_lengths_ = "";
  }

  /// Helper function to build a WERD_CHOICE from the given string,
@ -316,8 +314,6 @@ class WERD_CHOICE {
    rating_ = kBadRating;
    certainty_ = -MAX_FLOAT32;
    fragment_mark_ = false;
-    unichar_string_ = "";
-    unichar_lengths_ = "";
  }

  /// This function assumes that there is enough space reserved
@ -373,19 +369,6 @@ class WERD_CHOICE {
    }
    return word_str;
  }
-  /// Since this function walks over the whole word to convert unichar ids
-  /// to unichars, it is best to call it once, e.g. after all changes to
-  /// unichar_ids_ in WERD_CHOICE are finished.
-  void populate_unichars() {
-    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
-  }
-
-  /// Undoes populate_unichars, so that unichar_string_ and unichar_lengths_
-  /// are empty.
-  void depopulate_unichars() {
-    unichar_string_ = "";
-    unichar_lengths_ = "";
-  }

  // Call this to override the default (strict left to right graphemes)
  // with the fact that some engine produces a "reading order" set of
@ -398,19 +381,17 @@ class WERD_CHOICE {
    return unichars_in_script_order_;
  }

-  /// This function should only be called if populate_unichars()
-  /// was called and WERD_CHOICE did not change since then.
+  // Returns a UTF-8 string equivalent to the current choice
+  // of UNICHAR IDs.
  const STRING &unichar_string() const {
-    assert(unichar_string_.length() <= 0 ||
-           unichar_string_.length() >= length_);  // sanity check
+    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_string_;
  }

-  /// This function should only be called if populate_unichars()
-  /// was called and WERD_CHOICE did not change since then.
+  // Returns the lengths, one byte each, representing the number of bytes
+  // required in the unichar_string for each UNICHAR_ID.
  const STRING &unichar_lengths() const {
-    assert(unichar_lengths_.length() <= 0 ||
-           unichar_lengths_.length() == length_);  // sanity check
+    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_lengths_;
  }
  const void print() const { this->print(""); }
@ -441,10 +422,10 @@ class WERD_CHOICE {
  // (for Arabic, that is right-to-left).
  bool unichars_in_script_order_;

-  // The following variables are only populated by calling populate_unichars().
-  // They are not synchronized with the values in unichar_ids otherwise.
-  STRING unichar_string_;
-  STRING unichar_lengths_;
+  // The following variables are populated and passed by reference any
+  // time unichar_string() or unichar_lengths() are called.
+  mutable STRING unichar_string_;
+  mutable STRING unichar_lengths_;

  bool unichar_info_present;

@ -484,7 +465,6 @@ void print_char_choices_list(
    );
 void print_word_alternates_list(
    WERD_CHOICE *word,
-    GenericVector<WERD_CHOICE *> *alternates,
-    bool needs_populate_unichars);
+    GenericVector<WERD_CHOICE *> *alternates);

 #endif
--- a/dict/hyphen.cpp
+++ b/dict/hyphen.cpp
@ -59,7 +59,6 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
    // Remove the last unichar id as it is a hyphen, and remove
    // any unichar_string/lengths that are present.
    hyphen_word_->remove_last_unichar_id();
-    hyphen_word_->depopulate_unichars();
    hyphen_active_dawgs_ = active_dawgs;
    hyphen_constraints_ = constraints;
  }
--- a/dict/permute.cpp
+++ b/dict/permute.cpp
@ -464,7 +464,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
     }

     if (part_choice && step > 1) {   // found lexicon match
-       part_choice->populate_unichars();
       get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
       float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
       if (permuter_state)
@ -496,7 +495,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
              best_choice->rating(), match_score, adjusted_score);
    best_choice->set_rating(adjusted_score);
  }
-  best_choice->populate_unichars();
  if (permute_debug)
    tprintf("Found Best CJK word %f: %s\n",
            best_choice->rating(), best_choice->unichar_string().string());
@ -649,7 +647,6 @@ WERD_CHOICE* Dict::permute_chartype_words(
  // All permuter choices should go through adjust_non_word so the choice
  // rating would be adjusted on the same scale.
  adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars();
  if (replaced) {
    // Apply a reward multiplier on rating if an chartype permutation is made.
    float rating = current_word->rating();
@ -748,7 +745,6 @@ WERD_CHOICE* Dict::permute_script_words(
  // All permuter choices should go through adjust_non_word so the choice
  // rating would be adjusted on the same scale.
  adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars();
  if (replaced) {
    // Apply a reward multiplier on rating if an script permutation is made.
    float rating = current_word->rating();
@ -769,7 +765,6 @@ WERD_CHOICE* Dict::permute_script_words(
 bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                              WERD_CHOICE *best_choice,
                              WERD_CHOICE *raw_choice) {
-  float old_raw_choice_rating = raw_choice->rating();
  if (permute_debug) {
    tprintf("\n\n\n##### Permute_Characters #######\n");
    print_char_choices_list("\n==> Input CharChoices", char_choices,
@ -781,18 +776,8 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
      get_top_choice_uid(char_choices.get(0)) == 0) return false;
  WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);

-  if (raw_choice->rating() < old_raw_choice_rating) {
-    // Populate unichars_ and unichar_lengths_ of raw_choice. This is
-    // needed for various components that still work with unichars rather
-    // than unichar ids (e.g. LearnWord).
-    raw_choice->populate_unichars();
-  }
  if (this_choice && this_choice->rating() < best_choice->rating()) {
    *best_choice = *this_choice;
-    // Populate unichars_ and unichar_lengths_ of best_choice. This is
-    // needed for various components that still work with unichars rather
-    // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
-    best_choice->populate_unichars();

    if (permute_debug) {
      best_choice->print("\n**** Populate BestChoice");
@ -914,8 +899,6 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
             current_word->debug_string().string(),
             current_word->rating(), current_word->certainty());
  }
-  current_word->populate_unichars();
-
  EnableChoiceAccum();
 }

--- a/dict/stopper.cpp
+++ b/dict/stopper.cpp
@ -703,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs
-  if (modified_best_choice) best_choice->populate_unichars();
+
  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
--- a/wordrec/language_model.cpp
+++ b/wordrec/language_model.cpp
@ -190,9 +190,6 @@ void LanguageModel::InitForWord(

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
-  // Assume that populate_unichars() has been called on a valid prev_word,
-  // which is the case, since it points to the final result of the
-  // classification of the previous word.
  if (language_model_ngram_on) {
    if (prev_word != NULL && prev_word->unichar_string() != NULL) {
      prev_word_str_ = prev_word->unichar_string();
@ -1234,7 +1231,6 @@ void LanguageModel::UpdateBestChoice(
    dict_->LogNewChoice(1.0, certainties, true, word);
    *(best_choice_bundle->raw_choice) = *word;
    best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
-    best_choice_bundle->raw_choice->populate_unichars();
    if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
  }

@ -1278,7 +1274,6 @@ void LanguageModel::UpdateBestChoice(
      }
      // Update best_choice_bundle.
      *(best_choice_bundle->best_choice) = *word;
-      best_choice_bundle->best_choice->populate_unichars();
      best_choice_bundle->updated = true;
      best_choice_bundle->best_char_choices->delete_data_pointers();
      best_choice_bundle->best_char_choices->clear();
--- a/wordrec/wordrec.cpp
+++ b/wordrec/wordrec.cpp
@ -189,7 +189,6 @@ void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
    alt_choice->set_rating(choice->Rating);
    alt_choice->set_certainty(choice->Certainty);
    word->alt_choices.push_back(alt_choice);
-    alt_choice->populate_unichars();
    if (wordrec_debug_level > 0) {
      tprintf("SaveAltChoices: %s %g\n",
              alt_choice->unichar_string().string(), alt_choice->rating());