From 018f192fc22283bb652b85c792b5e01767c61730 Mon Sep 17 00:00:00 2001
From: "david.eger@gmail.com"
 <david.eger@gmail.com@d0cd1f9f-072b-0410-8dd7-cf729c803f20>
Date: Wed, 15 Feb 2012 01:37:00 +0000
Subject: [PATCH] Abolish populate_unichars(), fixing seg fault reported in
 Debian: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658634

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@675 d0cd1f9f-072b-0410-8dd7-cf729c803f20
---
 ccmain/applybox.cpp        |  2 --
 ccmain/control.cpp         |  9 ++------
 ccmain/cube_control.cpp    |  1 -
 ccmain/docqual.cpp         |  3 ---
 ccmain/output.cpp          |  1 -
 ccmain/reject.cpp          |  6 ------
 ccmain/tfacepp.cpp         |  2 --
 ccstruct/pageres.cpp       |  5 -----
 ccstruct/ratngs.cpp        | 15 +-------------
 ccstruct/ratngs.h          | 42 ++++++++++----------------------------
 dict/hyphen.cpp            |  1 -
 dict/permute.cpp           | 17 ---------------
 dict/stopper.cpp           |  2 +-
 wordrec/language_model.cpp |  5 -----
 wordrec/wordrec.cpp        |  1 -
 15 files changed, 15 insertions(+), 97 deletions(-)

diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp
index 2d183eb3..4ac3d396 100644
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@@ -249,7 +249,6 @@ static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
     word_choice->append_unichar_id(bc->unichar_id(), 1,
                                    bc->rating(), bc->certainty());
   }
-  word_choice->populate_unichars();
 }
 
 // Tests the chopper by exhaustively running chop_one_blob.
@@ -776,7 +775,6 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
       choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
     }
-    choice->populate_unichars();
     if (word_res->best_choice != NULL)
       delete word_res->best_choice;
     word_res->best_choice = choice;
diff --git a/ccmain/control.cpp b/ccmain/control.cpp
index a28b9a00..2901f4e7 100644
--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@@ -420,7 +420,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
 
   WERD_RES *w_prev = NULL;
   WERD_RES *w = word_it.word();
-  if (w && w->best_choice) w->best_choice->populate_unichars();
   while (1) {
     w_prev = w;
     while (word_it.forward() != NULL &&
@@ -429,8 +428,6 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
     }
     if (!word_it.word()) break;
     w = word_it.word();
-    if (w && w->best_choice)
-      w->best_choice->populate_unichars();
     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
       continue;
     }
@@ -490,11 +487,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
     }
     if (tessedit_bigram_debug > 1) {
       if (w_prev->alt_choices.size() > 1) {
-        print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices,
-                                   false);
+        print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
       }
       if (w->alt_choices.size() > 1) {
-        print_word_alternates_list(w->best_choice, &w->alt_choices, false);
+        print_word_alternates_list(w->best_choice, &w->alt_choices);
       }
     }
     float best_rating = 0.0;
@@ -1244,7 +1240,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
   } else {
     // Just correct existing classification.
     CorrectRepcharChoices(best_choice, word_res);
-    word_res->best_choice->populate_unichars();
     word_res->reject_map.initialise(word.length());
   }
 }
diff --git a/ccmain/cube_control.cpp b/ccmain/cube_control.cpp
index cd5cc741..5b222a12 100644
--- a/ccmain/cube_control.cpp
+++ b/ccmain/cube_control.cpp
@@ -187,7 +187,6 @@ static WERD_CHOICE *create_werd_choice(
     // Add list to the clist
     blob_choices_it.add_to_end(choices_list);
   }
-  werd_choice->populate_unichars();
   werd_choice->set_certainty(certainty);
   werd_choice->set_blob_choices(blob_choices);
   return werd_choice;
diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp
index c8fed20f..4d9ce4a3 100644
--- a/ccmain/docqual.cpp
+++ b/ccmain/docqual.cpp
@@ -684,9 +684,6 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
         word_res->reject_map[i].setrej_unlv_rej ();
     }
   }
-  if (modified) {
-    word_res->best_choice->populate_unichars();
-  }
 }
 
 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
diff --git a/ccmain/output.cpp b/ccmain/output.cpp
index 95ed7214..3081874e 100644
--- a/ccmain/output.cpp
+++ b/ccmain/output.cpp
@@ -248,7 +248,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
       BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
       if (!blob_choices_it.empty()) delete blob_choices_it.extract();
     }
-    word->best_choice->populate_unichars();
     word->reject_map.remove_pos (0);
     word->box_word->DeleteBox(0);
   }
diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp
index 94497bfa..d9ef14c6 100644
--- a/ccmain/reject.cpp
+++ b/ccmain/reject.cpp
@@ -847,9 +847,6 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
     }
     prev_right = out_box.right();
   }
-  if (modified) {
-    best_choice->populate_unichars();
-  }
 }
 
 // Note: After running this function word_res->best_choice->blob_choices()
@@ -975,9 +972,6 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
       }
     }
   }
-  if (modified) {
-    best_choice->populate_unichars();
-  }
 }
 
 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
diff --git a/ccmain/tfacepp.cpp b/ccmain/tfacepp.cpp
index d8459328..a3eeaaea 100644
--- a/ccmain/tfacepp.cpp
+++ b/ccmain/tfacepp.cpp
@@ -130,7 +130,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
       word->raw_choice->append_unichar_id(space_id, 1, 0.0,
                                           word->raw_choice->certainty());
     }
-    word->raw_choice->populate_unichars();
   }
 
   // Do sanity checks and minor fixes on best_choice.
@@ -164,7 +163,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
       word->best_choice->append_unichar_id(space_id, 1, 0.0,
                                            word->best_choice->certainty());
     }
-    word->best_choice->populate_unichars();
   }
 }
 
diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp
index 1ce7c502..6696ce02 100644
--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@@ -563,7 +563,6 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
     bc_it.add_after_then_move(choice_list);
   }
   best_choice->set_blob_choices(word_choices);
-  best_choice->populate_unichars();
   delete raw_choice;
   raw_choice = new WERD_CHOICE(*best_choice);
   reject_map.initialise(blob_count);
@@ -633,10 +632,6 @@ bool WERD_RES::ConditionalBlobMerge(
   }
   delete class_cb;
   delete box_cb;
-  if (modified) {
-    best_choice->populate_unichars();
-    raw_choice->populate_unichars();
-  }
   return modified;
 }
 
diff --git a/ccstruct/ratngs.cpp b/ccstruct/ratngs.cpp
index 02855851..0ba60ceb 100644
--- a/ccstruct/ratngs.cpp
+++ b/ccstruct/ratngs.cpp
@@ -223,8 +223,6 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
  * reverse_and_mirror_unichar_ids
  *
  * Reverses and mirrors unichars in unichar_ids.
- * Note: this function does not change unichar_string_, it only modifies
- * unichar_ids array.
  */
 void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
   for (int i = 0; i < length_/2; ++i) {
@@ -358,8 +356,6 @@ WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
              second.permuter() != permuter_) {
     permuter_ = COMPOUND_PERM;
   }
-  unichar_string_ += second.unichar_string();
-  unichar_lengths_ += second.unichar_lengths();
 
   // Append a deep copy of second blob_choices if it exists.
   if (second.blob_choices_ != NULL) {
@@ -412,8 +408,6 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
   certainty_ = source.certainty();
   permuter_ = source.permuter();
   fragment_mark_ = source.fragment_mark();
-  unichar_string_ = source.unichar_string();
-  unichar_lengths_ = source.unichar_lengths();
 
   // Delete existing blob_choices
   this->delete_blob_choices();
@@ -633,15 +627,8 @@ void print_char_choices_list(const char *msg,
  */
 void print_word_alternates_list(
     WERD_CHOICE *word,
-    GenericVector<WERD_CHOICE *> *alternates,
-    bool needs_populate_unichars) {
+    GenericVector<WERD_CHOICE *> *alternates) {
   if (!word || !alternates) return;
-  if (needs_populate_unichars) {
-    word->populate_unichars();
-    for (int i = 0; i < alternates->size(); ++i) {
-      alternates->get(i)->populate_unichars();
-    }
-  }
 
   STRING alternates_str;
   for (int i = 0; i < alternates->size(); i++) {
diff --git a/ccstruct/ratngs.h b/ccstruct/ratngs.h
index 4145aa0c..481847ec 100644
--- a/ccstruct/ratngs.h
+++ b/ccstruct/ratngs.h
@@ -297,8 +297,6 @@ class WERD_CHOICE {
     fragment_mark_ = false;
     blob_choices_ = NULL;
     unichars_in_script_order_ = false;  // Tesseract is strict left-to-right.
-    unichar_string_ = "";
-    unichar_lengths_ = "";
   }
 
   /// Helper function to build a WERD_CHOICE from the given string,
@@ -316,8 +314,6 @@ class WERD_CHOICE {
     rating_ = kBadRating;
     certainty_ = -MAX_FLOAT32;
     fragment_mark_ = false;
-    unichar_string_ = "";
-    unichar_lengths_ = "";
   }
 
   /// This function assumes that there is enough space reserved
@@ -373,19 +369,6 @@ class WERD_CHOICE {
     }
     return word_str;
   }
-  /// Since this function walks over the whole word to convert unichar ids
-  /// to unichars, it is best to call it once, e.g. after all changes to
-  /// unichar_ids_ in WERD_CHOICE are finished.
-  void populate_unichars() {
-    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
-  }
-
-  /// Undoes populate_unichars, so that unichar_string_ and unichar_lengths_
-  /// are empty.
-  void depopulate_unichars() {
-    unichar_string_ = "";
-    unichar_lengths_ = "";
-  }
 
   // Call this to override the default (strict left to right graphemes)
   // with the fact that some engine produces a "reading order" set of
@@ -398,19 +381,17 @@ class WERD_CHOICE {
     return unichars_in_script_order_;
   }
 
-  /// This function should only be called if populate_unichars()
-  /// was called and WERD_CHOICE did not change since then.
+  // Returns a UTF-8 string equivalent to the current choice
+  // of UNICHAR IDs.
   const STRING &unichar_string() const {
-    assert(unichar_string_.length() <= 0 ||
-           unichar_string_.length() >= length_);  // sanity check
+    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
     return unichar_string_;
   }
 
-  /// This function should only be called if populate_unichars()
-  /// was called and WERD_CHOICE did not change since then.
+  // Returns the lengths, one byte each, representing the number of bytes
+  // required in the unichar_string for each UNICHAR_ID.
   const STRING &unichar_lengths() const {
-    assert(unichar_lengths_.length() <= 0 ||
-           unichar_lengths_.length() == length_);  // sanity check
+    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
     return unichar_lengths_;
   }
   const void print() const { this->print(""); }
@@ -441,10 +422,10 @@ class WERD_CHOICE {
   // (for Arabic, that is right-to-left).
   bool unichars_in_script_order_;
 
-  // The following variables are only populated by calling populate_unichars().
-  // They are not synchronized with the values in unichar_ids otherwise.
-  STRING unichar_string_;
-  STRING unichar_lengths_;
+  // The following variables are recomputed, and one of them is returned by
+  // reference, any time unichar_string() or unichar_lengths() is called.
+  mutable STRING unichar_string_;
+  mutable STRING unichar_lengths_;
 
   bool unichar_info_present;
 
@@ -484,7 +465,6 @@ void print_char_choices_list(
     );
 void print_word_alternates_list(
     WERD_CHOICE *word,
-    GenericVector<WERD_CHOICE *> *alternates,
-    bool needs_populate_unichars);
+    GenericVector<WERD_CHOICE *> *alternates);
 
 #endif
diff --git a/dict/hyphen.cpp b/dict/hyphen.cpp
index 1f39afdc..9f4296e8 100644
--- a/dict/hyphen.cpp
+++ b/dict/hyphen.cpp
@@ -59,7 +59,6 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
     // Remove the last unichar id as it is a hyphen, and remove
     // any unichar_string/lengths that are present.
     hyphen_word_->remove_last_unichar_id();
-    hyphen_word_->depopulate_unichars();
     hyphen_active_dawgs_ = active_dawgs;
     hyphen_constraints_ = constraints;
   }
diff --git a/dict/permute.cpp b/dict/permute.cpp
index e4111669..d0fc13f3 100644
--- a/dict/permute.cpp
+++ b/dict/permute.cpp
@@ -464,7 +464,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
      }
 
      if (part_choice && step > 1) {   // found lexicon match
-       part_choice->populate_unichars();
        get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
        float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
        if (permuter_state)
@@ -496,7 +495,6 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
               best_choice->rating(), match_score, adjusted_score);
     best_choice->set_rating(adjusted_score);
   }
-  best_choice->populate_unichars();
   if (permute_debug)
     tprintf("Found Best CJK word %f: %s\n",
             best_choice->rating(), best_choice->unichar_string().string());
@@ -649,7 +647,6 @@ WERD_CHOICE* Dict::permute_chartype_words(
   // All permuter choices should go through adjust_non_word so the choice
   // rating would be adjusted on the same scale.
   adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars();
   if (replaced) {
     // Apply a reward multiplier on rating if an chartype permutation is made.
     float rating = current_word->rating();
@@ -748,7 +745,6 @@ WERD_CHOICE* Dict::permute_script_words(
   // All permuter choices should go through adjust_non_word so the choice
   // rating would be adjusted on the same scale.
   adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars();
   if (replaced) {
     // Apply a reward multiplier on rating if an script permutation is made.
     float rating = current_word->rating();
@@ -769,7 +765,6 @@ WERD_CHOICE* Dict::permute_script_words(
 bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                               WERD_CHOICE *best_choice,
                               WERD_CHOICE *raw_choice) {
-  float old_raw_choice_rating = raw_choice->rating();
   if (permute_debug) {
     tprintf("\n\n\n##### Permute_Characters #######\n");
     print_char_choices_list("\n==> Input CharChoices", char_choices,
@@ -781,18 +776,8 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
       get_top_choice_uid(char_choices.get(0)) == 0) return false;
   WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);
 
-  if (raw_choice->rating() < old_raw_choice_rating) {
-    // Populate unichars_ and unichar_lengths_ of raw_choice. This is
-    // needed for various components that still work with unichars rather
-    // than unichar ids (e.g. LearnWord).
-    raw_choice->populate_unichars();
-  }
   if (this_choice && this_choice->rating() < best_choice->rating()) {
     *best_choice = *this_choice;
-    // Populate unichars_ and unichar_lengths_ of best_choice. This is
-    // needed for various components that still work with unichars rather
-    // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
-    best_choice->populate_unichars();
 
     if (permute_debug) {
       best_choice->print("\n**** Populate BestChoice");
@@ -914,8 +899,6 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
              current_word->debug_string().string(),
              current_word->rating(), current_word->certainty());
   }
-  current_word->populate_unichars();
-
   EnableChoiceAccum();
 }
 
diff --git a/dict/stopper.cpp b/dict/stopper.cpp
index 01d99f09..75a9657d 100644
--- a/dict/stopper.cpp
+++ b/dict/stopper.cpp
@@ -703,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
       }  // end searching AmbigSpec_LIST
     }  // end searching best_choice
   }  // end searching replace and dangerous ambigs
-  if (modified_best_choice) best_choice->populate_unichars();
+
   // If any ambiguities were found permute the constructed ambig_blob_choices
   // to see if an alternative dictionary word can be found.
   if (ambigs_found) {
diff --git a/wordrec/language_model.cpp b/wordrec/language_model.cpp
index 712100f3..d441dab2 100644
--- a/wordrec/language_model.cpp
+++ b/wordrec/language_model.cpp
@@ -190,9 +190,6 @@ void LanguageModel::InitForWord(
 
   // Fill prev_word_str_ with the last language_model_ngram_order
   // unichars from prev_word.
-  // Assume that populate_unichars() has been called on a valid prev_word,
-  // which is the case, since it points to the final result of the
-  // classification of the previous word.
   if (language_model_ngram_on) {
     if (prev_word != NULL && prev_word->unichar_string() != NULL) {
       prev_word_str_ = prev_word->unichar_string();
@@ -1234,7 +1231,6 @@ void LanguageModel::UpdateBestChoice(
     dict_->LogNewChoice(1.0, certainties, true, word);
     *(best_choice_bundle->raw_choice) = *word;
     best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
-    best_choice_bundle->raw_choice->populate_unichars();
     if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
   }
 
@@ -1278,7 +1274,6 @@ void LanguageModel::UpdateBestChoice(
       }
       // Update best_choice_bundle.
       *(best_choice_bundle->best_choice) = *word;
-      best_choice_bundle->best_choice->populate_unichars();
       best_choice_bundle->updated = true;
       best_choice_bundle->best_char_choices->delete_data_pointers();
       best_choice_bundle->best_char_choices->clear();
diff --git a/wordrec/wordrec.cpp b/wordrec/wordrec.cpp
index e258e073..ec684984 100644
--- a/wordrec/wordrec.cpp
+++ b/wordrec/wordrec.cpp
@@ -189,7 +189,6 @@ void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
     alt_choice->set_rating(choice->Rating);
     alt_choice->set_certainty(choice->Certainty);
     word->alt_choices.push_back(alt_choice);
-    alt_choice->populate_unichars();
     if (wordrec_debug_level > 0) {
       tprintf("SaveAltChoices: %s %g\n",
               alt_choice->unichar_string().string(), alt_choice->rating());