mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
2c909702c9
commit
4d514d5a60
12
ChangeLog
12
ChangeLog
@ -1,3 +1,15 @@
|
||||
2013-09-20 v3.03
|
||||
* Added Renderer to API to allow document-level processing and output
|
||||
of document formats, like hOCR, PDF.
|
||||
* Major refactor of word-level recognition, beam search, eliminating dead code.
|
||||
* Refactored classifier to make it easier to add new ones.
|
||||
* Generalized feature extractor to allow feature extraction from greyscale.
|
||||
* Improved sub/superscript treatment.
|
||||
* Improved baseline fit.
|
||||
* Added set_unicharset_properties to training tools.
|
||||
* Many bug fixes.
|
||||
|
||||
|
||||
2012-02-01 - v3.02
|
||||
* Moved ResultIterator/PageIterator to ccmain.
|
||||
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.
|
||||
|
@ -9,7 +9,7 @@ if VISIBILITY
|
||||
AM_CPPFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
|
||||
endif
|
||||
|
||||
include_HEADERS = apitypes.h baseapi.h capi.h
|
||||
include_HEADERS = apitypes.h baseapi.h capi.h renderer.h
|
||||
lib_LTLIBRARIES =
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
@ -35,7 +35,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
|
||||
if VISIBILITY
|
||||
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
|
||||
endif
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp
|
||||
|
||||
lib_LTLIBRARIES += libtesseract.la
|
||||
libtesseract_la_LDFLAGS =
|
||||
|
14
api/capi.cpp
14
api/capi.cpp
@ -2,6 +2,8 @@
|
||||
# define TESS_CAPI_INCLUDE_BASEAPI
|
||||
#endif
|
||||
#include "capi.h"
|
||||
#include "genericvector.h"
|
||||
#include "strngs.h"
|
||||
|
||||
TESS_API const char* TESS_CALL TessVersion()
|
||||
{
|
||||
@ -382,10 +384,10 @@ TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* resu
|
||||
return handle->DetectOS(results) ? TRUE : FALSE;
|
||||
}
|
||||
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
|
||||
int* num_features, int* FeatureOutlineIndex)
|
||||
{
|
||||
handle->GetFeaturesForBlob(blob, *denorm, int_features, num_features, FeatureOutlineIndex);
|
||||
handle->GetFeaturesForBlob(blob, int_features, num_features, FeatureOutlineIndex);
|
||||
}
|
||||
|
||||
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom)
|
||||
@ -393,10 +395,10 @@ TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top,
|
||||
return TessBaseAPI::FindRowForBox(blocks, left, top, right, bottom);
|
||||
}
|
||||
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
|
||||
int* unichar_ids, float* ratings, int* num_matches_returned)
|
||||
{
|
||||
handle->RunAdaptiveClassifier(blob, *denorm, num_max_matches, unichar_ids, ratings, num_matches_returned);
|
||||
handle->RunAdaptiveClassifier(blob, num_max_matches, unichar_ids, ratings, num_matches_returned);
|
||||
}
|
||||
|
||||
TESS_API const char* TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id)
|
||||
@ -424,9 +426,9 @@ TESS_API TBLOB* TESS_CALL TessMakeTBLOB(struct Pix *pix)
|
||||
return TessBaseAPI::MakeTBLOB(pix);
|
||||
}
|
||||
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm)
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode)
|
||||
{
|
||||
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE, denorm);
|
||||
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE);
|
||||
}
|
||||
|
||||
TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle)
|
||||
|
@ -205,11 +205,11 @@ TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* han
|
||||
TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f);
|
||||
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results);
|
||||
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
|
||||
int* num_features, int* FeatureOutlineIndex);
|
||||
|
||||
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom);
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
|
||||
int* unichar_ids, float* ratings, int* num_matches_returned);
|
||||
#endif
|
||||
|
||||
@ -226,7 +226,7 @@ TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle);
|
||||
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender);
|
||||
TESS_API TBLOB*
|
||||
TESS_CALL TessMakeTBLOB(Pix *pix);
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm);
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode);
|
||||
|
||||
TESS_API TessOcrEngineMode
|
||||
TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle);
|
||||
|
@ -19,7 +19,7 @@ noinst_HEADERS = \
|
||||
equationdetect.h fixspace.h imgscale.h mutableiterator.h osdetect.h \
|
||||
output.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \
|
||||
reject.h scaleimg.h tessbox.h tessedit.h tesseractclass.h \
|
||||
tesseract_cube_combiner.h tessvars.h tfacep.h tfacepp.h werdit.h
|
||||
tesseract_cube_combiner.h tessvars.h werdit.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_main.la
|
||||
@ -46,7 +46,7 @@ libtesseract_main_la_SOURCES = \
|
||||
imgscale.cpp ltrresultiterator.cpp \
|
||||
osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \
|
||||
pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \
|
||||
reject.cpp resultiterator.cpp scaleimg.cpp \
|
||||
reject.cpp resultiterator.cpp scaleimg.cpp superscript.cpp \
|
||||
tesseract_cube_combiner.cpp \
|
||||
tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
|
||||
tfacepp.cpp thresholder.cpp \
|
||||
|
@ -114,27 +114,12 @@ BOOL8 Tesseract::word_adaptable( //should we adapt?
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
|
||||
if (flags.bit (CHECK_AMBIG_WERD) &&
|
||||
!getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
|
||||
word->best_choice->dangerous_ambig_found()) {
|
||||
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Do not adapt to words that are composed from fragments if
|
||||
// tessedit_adapt_to_char_fragments is false.
|
||||
if (!tessedit_adapt_to_char_fragments) {
|
||||
const char *fragment_lengths = word->best_choice->fragment_lengths();
|
||||
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
if (fragment_lengths[i] > 1) {
|
||||
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
|
||||
return false; // found a character composed from fragments
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (tessedit_adaption_debug) {
|
||||
tprintf("returning status %d\n", status);
|
||||
}
|
||||
|
@ -235,21 +235,6 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
||||
return page_res;
|
||||
}
|
||||
|
||||
// Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
|
||||
// the top choices. Avoids problems with very long words.
|
||||
static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
|
||||
const UNICHARSET& unicharset,
|
||||
WERD_CHOICE* word_choice) {
|
||||
*word_choice = WERD_CHOICE(&unicharset); // clear the word choice.
|
||||
word_choice->make_bad();
|
||||
for (int i = 0; i < char_choices.size(); ++i) {
|
||||
BLOB_CHOICE_IT it(char_choices[i]);
|
||||
BLOB_CHOICE* bc = it.data();
|
||||
word_choice->append_unichar_id(bc->unichar_id(), 1,
|
||||
bc->rating(), bc->certainty());
|
||||
}
|
||||
}
|
||||
|
||||
// Tests the chopper by exhaustively running chop_one_blob.
|
||||
// The word_res will contain filled chopped_word, seam_array, denorm,
|
||||
// box_word and best_state for the maximally chopped word.
|
||||
@ -257,7 +242,8 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
BLOCK* block, ROW* row,
|
||||
WERD_RES* word_res) {
|
||||
if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
|
||||
this->textord_use_cjk_fp_model,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
row, block)) {
|
||||
word_res->CloneChoppedToRebuild();
|
||||
return;
|
||||
@ -266,13 +252,10 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
tprintf("Maximally chopping word at:");
|
||||
word_res->word->bounding_box().print();
|
||||
}
|
||||
blob_match_table.init_match_table();
|
||||
BLOB_CHOICE_LIST *match_result;
|
||||
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
|
||||
ASSERT_HOST(word_res->chopped_word->blobs != NULL);
|
||||
GenericVector<BLOB_CHOICE*> blob_choices;
|
||||
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
|
||||
float rating = static_cast<float>(MAX_INT8);
|
||||
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
|
||||
blob = blob->next) {
|
||||
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
|
||||
// The rating and certainty are not quite arbitrary. Since
|
||||
// select_blob_to_chop uses the worst certainty to choose, they all have
|
||||
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
|
||||
@ -281,32 +264,33 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
// produced, however much chopping is required. The chops are thus only
|
||||
// limited by the ability of the chopper to find suitable chop points,
|
||||
// and not by the value of the certainties.
|
||||
match_result = fake_classify_blob(0, rating, -rating);
|
||||
modify_blob_choice(match_result, 0);
|
||||
ASSERT_HOST(!match_result->empty());
|
||||
*char_choices += match_result;
|
||||
BLOB_CHOICE* choice =
|
||||
new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
|
||||
blob_choices.push_back(choice);
|
||||
rating -= 0.125f;
|
||||
}
|
||||
inT32 blob_number;
|
||||
const double e = exp(1.0); // The base of natural logs.
|
||||
int blob_number;
|
||||
int right_chop_index = 0;
|
||||
if (!assume_fixed_pitch_char_segment) {
|
||||
// We only chop if the language is not fixed pitch like CJK.
|
||||
if (prioritize_division) {
|
||||
while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
|
||||
} else {
|
||||
while (chop_one_blob(word_res->chopped_word, char_choices,
|
||||
&blob_number, &word_res->seam_array,
|
||||
&right_chop_index));
|
||||
SEAM* seam = NULL;
|
||||
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
|
||||
&blob_number)) != NULL) {
|
||||
word_res->InsertSeam(blob_number, seam);
|
||||
BLOB_CHOICE* left_choice = blob_choices[blob_number];
|
||||
rating = left_choice->rating() / e;
|
||||
left_choice->set_rating(rating);
|
||||
left_choice->set_certainty(-rating);
|
||||
// combine confidence w/ serial #
|
||||
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
|
||||
rating - 0.125f, -rating,
|
||||
-1, -1, 0, 0, 0, 0, BCC_FAKE);
|
||||
blob_choices.insert(right_choice, blob_number + 1);
|
||||
}
|
||||
}
|
||||
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
|
||||
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
|
||||
word_res->CloneChoppedToRebuild();
|
||||
blob_match_table.end_match_table();
|
||||
if (char_choices != NULL) {
|
||||
char_choices->delete_data_pointers();
|
||||
delete char_choices;
|
||||
}
|
||||
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
|
||||
}
|
||||
|
||||
// Helper to compute the dispute resolution metric.
|
||||
@ -558,7 +542,6 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
||||
// substitutions ARE used.
|
||||
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
WERD_RES* word_res) {
|
||||
blob_match_table.init_match_table();
|
||||
// Classify all required combinations of blobs and save results in choices.
|
||||
int word_length = word_res->box_word->length();
|
||||
GenericVector<BLOB_CHOICE_LIST*>* choices =
|
||||
@ -566,8 +549,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
for (int i = 0; i < word_length; ++i) {
|
||||
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
|
||||
BLOB_CHOICE_LIST* match_result = classify_piece(
|
||||
word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
|
||||
i, i + j - 1, word_res->blamer_bundle);
|
||||
word_res->seam_array, i, i + j - 1, "Applybox",
|
||||
word_res->chopped_word, word_res->blamer_bundle);
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("%d+%d:", i, j);
|
||||
print_ratings_list("Segment:", match_result, unicharset);
|
||||
@ -583,7 +566,6 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
float best_rating = 0.0f;
|
||||
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
|
||||
&search_segmentation, &best_rating, &word_res->best_state);
|
||||
blob_match_table.end_match_table();
|
||||
for (int i = 0; i < word_length; ++i)
|
||||
choices[i].delete_data_pointers();
|
||||
delete [] choices;
|
||||
@ -591,9 +573,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
// Build the original segmentation and if it is the same length as the
|
||||
// truth, assume it will do.
|
||||
int blob_count = 1;
|
||||
for (int s = 0; s < array_count(word_res->seam_array); ++s) {
|
||||
SEAM* seam =
|
||||
reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
|
||||
for (int s = 0; s < word_res->seam_array.size(); ++s) {
|
||||
SEAM* seam = word_res->seam_array[s];
|
||||
if (seam->split1 == NULL) {
|
||||
word_res->best_state.push_back(blob_count);
|
||||
blob_count = 1;
|
||||
@ -707,21 +688,25 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||
WERD_RES* word_res;
|
||||
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
||||
int ok_in_word = 0;
|
||||
BLOB_CHOICE_LIST_VECTOR char_choices;
|
||||
for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
|
||||
if (word_res->correct_text[i].length() > 0) {
|
||||
int blob_count = word_res->correct_text.size();
|
||||
WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
|
||||
word_choice->set_permuter(TOP_CHOICE_PERM);
|
||||
for (int c = 0; c < blob_count; ++c) {
|
||||
if (word_res->correct_text[c].length() > 0) {
|
||||
++ok_in_word;
|
||||
}
|
||||
// Since we only need a fake word_res->best_choice, the actual
|
||||
// unichar_ids do not matter. Which is fortunate, since TidyUp()
|
||||
// can be called while training Tesseract, at the stage where
|
||||
// unicharset is not meaningful yet.
|
||||
char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
|
||||
word_choice->append_unichar_id_space_allocated(
|
||||
INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
|
||||
}
|
||||
if (ok_in_word > 0) {
|
||||
ok_blob_count += ok_in_word;
|
||||
bad_blob_count += word_res->correct_text.size() - ok_in_word;
|
||||
MakeWordChoice(char_choices, unicharset, word_res->best_choice);
|
||||
word_res->LogNewRawChoice(word_choice);
|
||||
word_res->LogNewCookedChoice(1, false, word_choice);
|
||||
} else {
|
||||
++unlabelled_words;
|
||||
if (applybox_debug > 0) {
|
||||
@ -730,7 +715,6 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||
}
|
||||
pr_it.DeleteCurrentWord();
|
||||
}
|
||||
char_choices.delete_data_pointers();
|
||||
}
|
||||
pr_it.restart_page();
|
||||
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
||||
@ -772,11 +756,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
||||
GenericVector<STRING> tokens;
|
||||
word_res->correct_text[i].split(' ', &tokens);
|
||||
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
|
||||
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
|
||||
choice->append_unichar_id_space_allocated(char_id,
|
||||
word_res->best_state[i],
|
||||
0.0f, 0.0f);
|
||||
}
|
||||
if (word_res->best_choice != NULL)
|
||||
delete word_res->best_choice;
|
||||
word_res->best_choice = choice;
|
||||
word_res->ClearWordChoices();
|
||||
word_res->LogNewRawChoice(choice);
|
||||
word_res->LogNewCookedChoice(1, false, choice);
|
||||
}
|
||||
}
|
||||
|
||||
@ -787,7 +773,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
|
||||
int word_count = 0;
|
||||
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
||||
word_res = pr_it.forward()) {
|
||||
LearnWord(filename.string(), NULL, word_res);
|
||||
LearnWord(filename.string(), word_res);
|
||||
++word_count;
|
||||
}
|
||||
tprintf("Generated training data for %d words\n", word_count);
|
||||
|
@ -29,7 +29,6 @@
|
||||
#include "ocrclass.h"
|
||||
#include "werdit.h"
|
||||
#include "drawfx.h"
|
||||
#include "tfacep.h"
|
||||
#include "tessbox.h"
|
||||
#include "tessvars.h"
|
||||
#include "pgedit.h"
|
||||
@ -55,6 +54,9 @@
|
||||
const char* const kBackUpConfigFile = "tempconfigdata.config";
|
||||
// Multiple of x-height to make a repeated word have spaces in it.
|
||||
const double kRepcharGapThreshold = 0.5;
|
||||
// Min believable x-height for any text when refitting as a fraction of
|
||||
// original x-height
|
||||
const double kMinRefitXHeightFraction = 0.5;
|
||||
|
||||
|
||||
/**
|
||||
@ -293,9 +295,9 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
// Update misadaption log (we only need to do it on pass 1, since
|
||||
// adaption only happens on this pass).
|
||||
if (page_res_it.word()->blamer_bundle != NULL &&
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
|
||||
page_res->misadaption_log.push_back(
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug);
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug());
|
||||
}
|
||||
|
||||
page_res_it.forward();
|
||||
@ -308,7 +310,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
page_res_it.restart_page();
|
||||
word_index = 0;
|
||||
most_recently_used_ = this;
|
||||
while (!tessedit_test_adaption && page_res_it.word() != NULL) {
|
||||
while (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
|
||||
page_res_it.word() != NULL) {
|
||||
set_global_loc_code(LOC_PASS2);
|
||||
word_index++;
|
||||
if (monitor != NULL) {
|
||||
@ -382,17 +385,6 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
blamer_pass(page_res);
|
||||
}
|
||||
|
||||
if (!save_blob_choices) {
|
||||
// We aren't saving the blob choices so get rid of them now.
|
||||
// set_blob_choices() does a deep clear.
|
||||
page_res_it.restart_page();
|
||||
while (page_res_it.word() != NULL) {
|
||||
WERD_RES* word = page_res_it.word();
|
||||
word->best_choice->set_blob_choices(NULL);
|
||||
page_res_it.forward();
|
||||
}
|
||||
}
|
||||
|
||||
// Write results pass.
|
||||
set_global_loc_code(LOC_WRITE_RESULTS);
|
||||
// This is now redundant, but retained commented so show how to obtain
|
||||
@ -436,39 +428,21 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
continue;
|
||||
}
|
||||
// Two words sharing the same language model, excellent!
|
||||
if (w->alt_choices.empty()) {
|
||||
if (tessedit_bigram_debug) {
|
||||
tprintf("Alt choices not set up for word choice: %s\n",
|
||||
w->best_choice->unichar_string().string());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (w_prev->alt_choices.empty()) {
|
||||
if (tessedit_bigram_debug) {
|
||||
tprintf("Alt choices not set up for word choice: %s\n",
|
||||
w_prev->best_choice->unichar_string().string());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// We saved alternate choices, excellent!
|
||||
GenericVector<WERD_CHOICE *> overrides_word1;
|
||||
GenericVector<GenericVector<int> *> overrides_word1_state;
|
||||
GenericVector<WERD_CHOICE *> overrides_word2;
|
||||
GenericVector<GenericVector<int> *> overrides_word2_state;
|
||||
|
||||
STRING orig_w1_str = w_prev->best_choice->unichar_string();
|
||||
STRING orig_w2_str = w->best_choice->unichar_string();
|
||||
WERD_CHOICE prev_best(w->uch_set);
|
||||
{
|
||||
int w1start, w1end;
|
||||
w_prev->WithoutFootnoteSpan(&w1start, &w1end);
|
||||
w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
|
||||
prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
|
||||
}
|
||||
WERD_CHOICE this_best(w->uch_set);
|
||||
{
|
||||
int w2start, w2end;
|
||||
w->WithoutFootnoteSpan(&w2start, &w2end);
|
||||
w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
|
||||
this_best = w->best_choice->shallow_copy(w2start, w2end);
|
||||
}
|
||||
|
||||
@ -484,37 +458,36 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
orig_w1_str.string(), orig_w2_str.string());
|
||||
}
|
||||
if (tessedit_bigram_debug > 1) {
|
||||
if (w_prev->alt_choices.size() > 1) {
|
||||
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
|
||||
if (!w_prev->best_choices.singleton()) {
|
||||
w_prev->PrintBestChoices();
|
||||
}
|
||||
if (w->alt_choices.size() > 1) {
|
||||
print_word_alternates_list(w->best_choice, &w->alt_choices);
|
||||
if (!w->best_choices.singleton()) {
|
||||
w->PrintBestChoices();
|
||||
}
|
||||
}
|
||||
float best_rating = 0.0;
|
||||
int best_idx = 0;
|
||||
for (int i = 0; i < w_prev->alt_choices.size(); i++) {
|
||||
WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
|
||||
WERD_CHOICE_IT prev_it(&w_prev->best_choices);
|
||||
for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
|
||||
WERD_CHOICE *p1 = prev_it.data();
|
||||
WERD_CHOICE strip1(w->uch_set);
|
||||
{
|
||||
int p1start, p1end;
|
||||
w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
|
||||
&p1start, &p1end);
|
||||
p1->GetNonSuperscriptSpan(&p1start, &p1end);
|
||||
strip1 = p1->shallow_copy(p1start, p1end);
|
||||
}
|
||||
for (int j = 0; j < w->alt_choices.size(); j++) {
|
||||
WERD_CHOICE *p2 = w->alt_choices.get(j);
|
||||
WERD_CHOICE_IT w_it(&w->best_choices);
|
||||
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
||||
WERD_CHOICE *p2 = w_it.data();
|
||||
WERD_CHOICE strip2(w->uch_set);
|
||||
{
|
||||
int p2start, p2end;
|
||||
w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
|
||||
p2->GetNonSuperscriptSpan(&p2start, &p2end);
|
||||
strip2 = p2->shallow_copy(p2start, p2end);
|
||||
}
|
||||
if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
|
||||
overrides_word1.push_back(p1);
|
||||
overrides_word1_state.push_back(&w_prev->alt_states.get(i));
|
||||
overrides_word2.push_back(p2);
|
||||
overrides_word2_state.push_back(&w->alt_states.get(j));
|
||||
if (overrides_word1.size() == 1 ||
|
||||
p1->rating() + p2->rating() < best_rating) {
|
||||
best_rating = p1->rating() + p2->rating();
|
||||
@ -538,12 +511,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
|
||||
STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
|
||||
if (new_w1_str != orig_w1_str) {
|
||||
w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
|
||||
*overrides_word1_state[best_idx]);
|
||||
w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
|
||||
}
|
||||
if (new_w2_str != orig_w2_str) {
|
||||
w->ReplaceBestChoice(*overrides_word2[best_idx],
|
||||
*overrides_word2_state[best_idx]);
|
||||
w->ReplaceBestChoice(overrides_word2[best_idx]);
|
||||
}
|
||||
if (tessedit_bigram_debug > 0) {
|
||||
STRING choices_description;
|
||||
@ -684,34 +655,8 @@ void Tesseract::blamer_pass(PAGE_RES* page_res) {
|
||||
for (page_res_it.restart_page(); page_res_it.word() != NULL;
|
||||
page_res_it.forward()) {
|
||||
WERD_RES *word = page_res_it.word();
|
||||
if (word->blamer_bundle == NULL) {
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT;
|
||||
word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason();
|
||||
word->blamer_bundle->debug += " to blame";
|
||||
} else if (word->blamer_bundle->incorrect_result_reason ==
|
||||
IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
|
||||
word->best_choice, wordrec_debug_blamer);
|
||||
} else {
|
||||
bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
|
||||
word->blamer_bundle->truth_text);
|
||||
IncorrectResultReason irr =
|
||||
word->blamer_bundle->incorrect_result_reason;
|
||||
if (irr == IRR_CORRECT && !correct) {
|
||||
STRING debug = "Choice is incorrect after recognition";
|
||||
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug,
|
||||
word->best_choice,
|
||||
wordrec_debug_blamer);
|
||||
} else if (irr != IRR_CORRECT && correct) {
|
||||
if (wordrec_debug_blamer) {
|
||||
tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
|
||||
}
|
||||
word->blamer_bundle->incorrect_result_reason = IRR_CORRECT;
|
||||
word->blamer_bundle->debug = "";
|
||||
}
|
||||
}
|
||||
page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++;
|
||||
BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
|
||||
page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
|
||||
}
|
||||
tprintf("Blame reasons:\n");
|
||||
for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
|
||||
@ -730,7 +675,9 @@ void Tesseract::blamer_pass(PAGE_RES* page_res) {
|
||||
// Helper returns true if the new_word is better than the word, using a
|
||||
// simple test of better certainty AND rating (to reduce false positives
|
||||
// from cube) or a dictionary vs non-dictionary word.
|
||||
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
|
||||
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word,
|
||||
double rating_ratio,
|
||||
double certainty_margin) {
|
||||
if (new_word.best_choice == NULL) {
|
||||
return false; // New one no good.
|
||||
}
|
||||
@ -742,7 +689,11 @@ static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
|
||||
return true; // New word has better confidence.
|
||||
}
|
||||
if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
|
||||
Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) {
|
||||
Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
|
||||
new_word.best_choice->rating() <
|
||||
word.best_choice->rating() * rating_ratio &&
|
||||
new_word.best_choice->certainty() >
|
||||
word.best_choice->certainty() - certainty_margin) {
|
||||
return true; // New word is from a dictionary.
|
||||
}
|
||||
return false; // New word is no better.
|
||||
@ -764,7 +715,9 @@ bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
|
||||
// (to reduce false positives from cube) or a dictionary vs non-dictionary
|
||||
// word.
|
||||
(this->*recognizer)(block, row, &lang_word);
|
||||
bool new_is_better = NewWordBetter(*word, lang_word);
|
||||
bool new_is_better = NewWordBetter(*word, lang_word,
|
||||
classify_max_rating_ratio,
|
||||
classify_max_certainty_margin);
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
if (lang_word.best_choice == NULL) {
|
||||
tprintf("New result %s better:%s\n",
|
||||
@ -793,6 +746,7 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
BLOCK* block,
|
||||
ROW *row,
|
||||
WERD_RES *word) {
|
||||
clock_t start_t = clock();
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("Processing word with lang %s at:",
|
||||
most_recently_used_->lang.string());
|
||||
@ -811,12 +765,15 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
if (!word->tess_failed && word->tess_accepted)
|
||||
result_type = "Accepted";
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
|
||||
tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
|
||||
" xht=[%g,%g]\n",
|
||||
result_type,
|
||||
word->best_choice->unichar_string().string(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->certainty(),
|
||||
word->tess_accepted, word->tess_would_adapt);
|
||||
word->tess_accepted, word->tess_would_adapt,
|
||||
word->best_choice->min_x_height(),
|
||||
word->best_choice->max_x_height());
|
||||
}
|
||||
if (word->tess_failed || !word->tess_accepted) {
|
||||
// Try all the other languages to see if they are any better.
|
||||
@ -846,6 +803,12 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
}
|
||||
}
|
||||
}
|
||||
clock_t ocr_t = clock();
|
||||
if (tessedit_timing_debug) {
|
||||
tprintf("%s (ocr took %.2f sec)\n",
|
||||
word->best_choice->unichar_string().string(),
|
||||
static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -860,92 +823,25 @@ void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
|
||||
cube_word_pass1(block, row, word);
|
||||
return;
|
||||
}
|
||||
match_word_pass_n(1, word, row, block);
|
||||
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
|
||||
word->tess_would_adapt = AdaptableWord(word);
|
||||
bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
|
||||
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
|
||||
BOOL8 adapt_ok;
|
||||
const char *rejmap;
|
||||
inT16 index;
|
||||
STRING mapstr = "";
|
||||
|
||||
check_debug_pt(word, 0);
|
||||
if (word->SetupForTessRecognition(unicharset, this, BestPix(),
|
||||
classify_bln_numeric_mode,
|
||||
this->textord_use_cjk_fp_model,
|
||||
row, block))
|
||||
tess_segment_pass1(word, blob_choices);
|
||||
if (!word->tess_failed) {
|
||||
/*
|
||||
The adaption step used to be here. It has been moved to after
|
||||
make_reject_map so that we know whether the word will be accepted in the
|
||||
first pass or not. This move will PREVENT adaption to words containing
|
||||
double quotes because the word will not be identical to what tess thinks
|
||||
its best choice is. (See CurrentBestChoiceIs in
|
||||
stopper.cpp which is used by AdaptableWord in
|
||||
adaptmatch.cpp)
|
||||
*/
|
||||
|
||||
if (!word->word->flag(W_REP_CHAR)) {
|
||||
// TODO(daria) delete these hacks when replaced by more generic code.
|
||||
// Convert '' (double single) to " (single double).
|
||||
word->fix_quotes(blob_choices);
|
||||
if (tessedit_fix_hyphens) // turn -- to -
|
||||
word->fix_hyphens(blob_choices);
|
||||
|
||||
word->tess_accepted = tess_acceptable_word(word->best_choice,
|
||||
word->raw_choice);
|
||||
|
||||
word->tess_would_adapt = word->best_choice && word->raw_choice &&
|
||||
AdaptableWord(word->rebuild_word,
|
||||
*word->best_choice,
|
||||
*word->raw_choice);
|
||||
// Also sets word->done flag
|
||||
make_reject_map(word, blob_choices, row, 1);
|
||||
|
||||
adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
|
||||
|
||||
if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
|
||||
if (!tessedit_tess_adapt_to_rejmap) {
|
||||
rejmap = NULL;
|
||||
} else {
|
||||
ASSERT_HOST(word->reject_map.length() ==
|
||||
word->best_choice->length());
|
||||
|
||||
for (index = 0; index < word->reject_map.length(); index++) {
|
||||
if (adapt_ok || word->reject_map[index].accepted())
|
||||
mapstr += '1';
|
||||
else
|
||||
mapstr += '0';
|
||||
}
|
||||
rejmap = mapstr.string();
|
||||
}
|
||||
// Send word to adaptive classifier for training.
|
||||
word->BestChoiceToCorrectText();
|
||||
set_word_fonts(word, blob_choices);
|
||||
LearnWord(NULL, rejmap, word);
|
||||
// Mark misadaptions if running blamer.
|
||||
if (word->blamer_bundle != NULL &&
|
||||
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
!ChoiceIsCorrect(*word->uch_set, word->best_choice,
|
||||
word->blamer_bundle->truth_text)) {
|
||||
word->blamer_bundle->misadaption_debug ="misadapt to word (";
|
||||
word->blamer_bundle->misadaption_debug +=
|
||||
word->best_choice->permuter_name();
|
||||
word->blamer_bundle->misadaption_debug += "): ";
|
||||
word->blamer_bundle->FillDebugString(
|
||||
"", word->best_choice, &(word->blamer_bundle->misadaption_debug));
|
||||
if (wordrec_debug_blamer) {
|
||||
tprintf("%s\n", word->blamer_bundle->misadaption_debug.string());
|
||||
}
|
||||
}
|
||||
if (adapt_ok) {
|
||||
// Send word to adaptive classifier for training.
|
||||
word->BestChoiceToCorrectText();
|
||||
LearnWord(NULL, word);
|
||||
// Mark misadaptions if running blamer.
|
||||
if (word->blamer_bundle != NULL) {
|
||||
word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
|
||||
wordrec_debug_blamer);
|
||||
}
|
||||
|
||||
if (tessedit_enable_doc_dict)
|
||||
tess_add_doc_word(word->best_choice);
|
||||
}
|
||||
}
|
||||
|
||||
// Save best choices in the WERD_CHOICE if needed
|
||||
word->best_choice->set_blob_choices(blob_choices);
|
||||
if (tessedit_enable_doc_dict && !word->IsAmbiguous())
|
||||
tess_add_doc_word(word->best_choice);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to report the result of the xheight fix.
|
||||
@ -976,7 +872,7 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
|
||||
if (original_misfits == 0)
|
||||
return false;
|
||||
float new_x_ht = ComputeCompatibleXheight(word);
|
||||
if (new_x_ht > 0.0f) {
|
||||
if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
|
||||
WERD_RES new_x_ht_word(word->word);
|
||||
if (word->blamer_bundle != NULL) {
|
||||
new_x_ht_word.blamer_bundle = new BlamerBundle();
|
||||
@ -984,7 +880,7 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
|
||||
}
|
||||
new_x_ht_word.x_height = new_x_ht;
|
||||
new_x_ht_word.caps_height = 0.0;
|
||||
match_word_pass2(&new_x_ht_word, row, block);
|
||||
match_word_pass_n(2, &new_x_ht_word, row, block);
|
||||
if (!new_x_ht_word.tess_failed) {
|
||||
int new_misfits = CountMisfitTops(&new_x_ht_word);
|
||||
if (debug_x_ht_level >= 1) {
|
||||
@ -1026,26 +922,24 @@ void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
|
||||
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
|
||||
return;
|
||||
|
||||
bool done_this_pass = false;
|
||||
set_global_subloc_code(SUBLOC_NORM);
|
||||
check_debug_pt(word, 30);
|
||||
if (!word->done || tessedit_training_tess) {
|
||||
word->caps_height = 0.0;
|
||||
if (word->x_height == 0.0f)
|
||||
word->x_height = row->x_height();
|
||||
match_word_pass2(word, row, block);
|
||||
done_this_pass = TRUE;
|
||||
match_word_pass_n(2, word, row, block);
|
||||
check_debug_pt(word, 40);
|
||||
}
|
||||
|
||||
SubAndSuperscriptFix(word);
|
||||
|
||||
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
|
||||
bool accept_new_xht = false;
|
||||
if (unicharset.top_bottom_useful() && unicharset.script_has_xheight()) {
|
||||
if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
|
||||
block->classify_rotation().y() == 0.0f) {
|
||||
// Use the tops and bottoms since they are available.
|
||||
accept_new_xht = TrainedXheightFix(word, block, row);
|
||||
TrainedXheightFix(word, block, row);
|
||||
}
|
||||
if (accept_new_xht)
|
||||
done_this_pass = true;
|
||||
// Test for small caps. Word capheight must be close to block xheight,
|
||||
// and word must contain no lower case letters, and at least one upper case.
|
||||
double small_cap_xheight = block->x_height() * kXHeightCapRatio;
|
||||
@ -1092,60 +986,38 @@ void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
|
||||
* Baseline normalize the word and pass it to Tess.
|
||||
*/
|
||||
|
||||
void Tesseract::match_word_pass2(WERD_RES *word, //word to do
|
||||
ROW *row,
|
||||
BLOCK* block) {
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
|
||||
|
||||
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
|
||||
ROW *row, BLOCK* block) {
|
||||
if (word->SetupForTessRecognition(unicharset, this, BestPix(),
|
||||
classify_bln_numeric_mode,
|
||||
this->textord_use_cjk_fp_model,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
row, block))
|
||||
tess_segment_pass2(word, blob_choices);
|
||||
tess_segment_pass_n(pass_n, word);
|
||||
|
||||
if (!word->tess_failed) {
|
||||
if (!word->word->flag (W_REP_CHAR)) {
|
||||
word->fix_quotes(blob_choices);
|
||||
word->fix_quotes();
|
||||
if (tessedit_fix_hyphens)
|
||||
word->fix_hyphens(blob_choices);
|
||||
word->fix_hyphens();
|
||||
/* Dont trust fix_quotes! - though I think I've fixed the bug */
|
||||
if (word->best_choice->length() != word->box_word->length() ||
|
||||
word->best_choice->length() != blob_choices->length()) {
|
||||
if (word->best_choice->length() != word->box_word->length()) {
|
||||
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
|
||||
" #Blobs=%d; #Choices=%d\n",
|
||||
" #Blobs=%d\n",
|
||||
word->best_choice->debug_string().string(),
|
||||
word->best_choice->length(),
|
||||
word->box_word->length(), blob_choices->length());
|
||||
word->box_word->length());
|
||||
|
||||
}
|
||||
word->tess_accepted = tess_acceptable_word(word->best_choice,
|
||||
word->raw_choice);
|
||||
word->tess_accepted = tess_acceptable_word(word);
|
||||
|
||||
make_reject_map (word, blob_choices, row, 2);
|
||||
// Also sets word->done flag
|
||||
make_reject_map(word, row, pass_n);
|
||||
}
|
||||
}
|
||||
set_word_fonts(word);
|
||||
|
||||
// Save best choices in the WERD_CHOICE if needed
|
||||
word->best_choice->set_blob_choices(blob_choices);
|
||||
set_word_fonts(word, blob_choices);
|
||||
|
||||
assert (word->raw_choice != NULL);
|
||||
}
|
||||
|
||||
// Helper to find the BLOB_CHOICE in the bc_list that matches the given
|
||||
// unichar_id, or NULL if there is no match.
|
||||
static BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
|
||||
BLOB_CHOICE_LIST* bc_list) {
|
||||
// Find the corresponding best BLOB_CHOICE.
|
||||
BLOB_CHOICE_IT choice_it(bc_list);
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
BLOB_CHOICE* choice = choice_it.data();
|
||||
if (choice->unichar_id() == char_id) {
|
||||
return choice;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
ASSERT_HOST(word->raw_choice != NULL);
|
||||
}
|
||||
|
||||
// Helper to return the best rated BLOB_CHOICE in the whole word that matches
|
||||
@ -1154,9 +1026,9 @@ static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
|
||||
WERD_RES* word_res) {
|
||||
// Find the corresponding best BLOB_CHOICE from any position in the word_res.
|
||||
BLOB_CHOICE* best_choice = NULL;
|
||||
BLOB_CHOICE_LIST_C_IT bc_it(word_res->best_choice->blob_choices());
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
|
||||
for (int i = 0; i < word_res->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE* choice = FindMatchingChoice(char_id,
|
||||
word_res->GetBlobChoices(i));
|
||||
if (choice != NULL) {
|
||||
if (best_choice == NULL || choice->rating() < best_choice->rating())
|
||||
best_choice = choice;
|
||||
@ -1171,12 +1043,11 @@ static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
|
||||
static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
|
||||
WERD_RES* word_res) {
|
||||
WERD_CHOICE* word = word_res->best_choice;
|
||||
BLOB_CHOICE_LIST_C_IT bc_it(word->blob_choices());
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
for (int i = 0; i < word_res->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
|
||||
bc_it.data());
|
||||
word_res->GetBlobChoices(i));
|
||||
if (choice == NULL) {
|
||||
BLOB_CHOICE_IT choice_it(bc_it.data());
|
||||
BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
|
||||
choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
|
||||
}
|
||||
}
|
||||
@ -1267,7 +1138,8 @@ void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
|
||||
// Setup the single char WERD_RES
|
||||
if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
|
||||
false,
|
||||
this->textord_use_cjk_fp_model,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
page_res_it->row()->row,
|
||||
page_res_it->block()->block)) {
|
||||
rep_word->CloneChoppedToRebuild();
|
||||
@ -1494,16 +1366,14 @@ static void find_modal_font( //good chars in word
|
||||
*
|
||||
* Get the fonts for the word.
|
||||
*/
|
||||
void Tesseract::set_word_fonts(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
if (blob_choices == NULL) return;
|
||||
void Tesseract::set_word_fonts(WERD_RES *word) {
|
||||
// Don't try to set the word fonts for a cube word, as the configs
|
||||
// will be meaningless.
|
||||
if (word->chopped_word == NULL) return;
|
||||
ASSERT_HOST(word->best_choice != NULL);
|
||||
|
||||
inT32 index; // char id index
|
||||
// character iterator
|
||||
BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
|
||||
BLOB_CHOICE_IT choice_it; // choice iterator
|
||||
int fontinfo_size = get_fontinfo_table().size();
|
||||
int fontset_size = get_fontset_table().size();
|
||||
@ -1516,10 +1386,9 @@ void Tesseract::set_word_fonts(WERD_RES *word,
|
||||
word->best_choice_fontinfo_ids.clear();
|
||||
}
|
||||
// Compute the modal font for the word
|
||||
for (char_it.mark_cycle_pt(), index = 0;
|
||||
!char_it.cycled_list(); ++index, char_it.forward()) {
|
||||
for (index = 0; index < word->best_choice->length(); ++index) {
|
||||
UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
|
||||
choice_it.set_to_list(char_it.data());
|
||||
choice_it.set_to_list(word->GetBlobChoices(index));
|
||||
if (tessedit_debug_fonts) {
|
||||
tprintf("Examining fonts in %s\n",
|
||||
word->best_choice->debug_string().string());
|
||||
|
@ -144,54 +144,6 @@ bool Tesseract::create_cube_box_word(Boxa *char_boxes,
|
||||
return true;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* create_werd_choice
|
||||
*
|
||||
**********************************************************************/
|
||||
static WERD_CHOICE *create_werd_choice(
|
||||
CharSamp** char_samples,
|
||||
int num_chars,
|
||||
const char* str,
|
||||
float certainty,
|
||||
const UNICHARSET &unicharset,
|
||||
CharSet* cube_char_set
|
||||
) {
|
||||
// Insert unichar ids into WERD_CHOICE
|
||||
WERD_CHOICE *werd_choice = new WERD_CHOICE(&unicharset, num_chars);
|
||||
// within a word, cube recognizes the word in reading order.
|
||||
werd_choice->set_unichars_in_script_order(true);
|
||||
ASSERT_HOST(werd_choice != NULL);
|
||||
UNICHAR_ID uch_id;
|
||||
for (int i = 0; i < num_chars; ++i) {
|
||||
uch_id = cube_char_set->UnicharID(char_samples[i]->StrLabel());
|
||||
if (uch_id != INVALID_UNICHAR_ID)
|
||||
werd_choice->append_unichar_id_space_allocated(
|
||||
uch_id, 1, 0.0, certainty);
|
||||
}
|
||||
|
||||
BLOB_CHOICE *blob_choice;
|
||||
BLOB_CHOICE_LIST *choices_list;
|
||||
BLOB_CHOICE_IT choices_list_it;
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it;
|
||||
blob_choices_it.set_to_list(blob_choices);
|
||||
|
||||
for (int i = 0; i < werd_choice->length(); ++i) {
|
||||
// Create new BLOB_CHOICE_LIST for this unichar
|
||||
choices_list = new BLOB_CHOICE_LIST();
|
||||
choices_list_it.set_to_list(choices_list);
|
||||
// Add a single BLOB_CHOICE to the list
|
||||
blob_choice = new BLOB_CHOICE(werd_choice->unichar_id(i),
|
||||
0.0, certainty, -1, -1, 0, 0, 0, false);
|
||||
choices_list_it.add_after_then_move(blob_choice);
|
||||
// Add list to the clist
|
||||
blob_choices_it.add_to_end(choices_list);
|
||||
}
|
||||
werd_choice->set_certainty(certainty);
|
||||
werd_choice->set_blob_choices(blob_choices);
|
||||
return werd_choice;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* init_cube_objects
|
||||
*
|
||||
@ -419,29 +371,32 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create cube's best choice.
|
||||
WERD_CHOICE* cube_werd_choice = create_werd_choice(
|
||||
char_samples, num_chars, cube_best_str.c_str(), cube_certainty,
|
||||
unicharset, cube_cntxt_->CharacterSet());
|
||||
delete []char_samples;
|
||||
// Fill tesseract result's fields with cube results
|
||||
fill_werd_res(cube_box_word, cube_best_str.c_str(), word);
|
||||
|
||||
if (!cube_werd_choice) {
|
||||
if (cube_debug_level > 0) {
|
||||
tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
|
||||
"create cube WERD_CHOICE\n");
|
||||
}
|
||||
word->SetupFake(unicharset);
|
||||
return false;
|
||||
// Create cube's best choice.
|
||||
BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
|
||||
for (int i = 0; i < num_chars; ++i) {
|
||||
UNICHAR_ID uch_id =
|
||||
cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
|
||||
choices[i] = new BLOB_CHOICE(uch_id, 0.0, cube_certainty, -1, -1,
|
||||
0, 0, 0, 0, BCC_STATIC_CLASSIFIER);
|
||||
}
|
||||
word->FakeClassifyWord(num_chars, choices);
|
||||
// within a word, cube recognizes the word in reading order.
|
||||
word->best_choice->set_unichars_in_script_order(true);
|
||||
delete [] choices;
|
||||
delete [] char_samples;
|
||||
|
||||
// Some sanity checks
|
||||
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
|
||||
|
||||
if (cube_debug_level || classify_debug_level) {
|
||||
tprintf("Cube result: %s r=%g, c=%g\n",
|
||||
cube_werd_choice->unichar_string().string(),
|
||||
cube_werd_choice->rating(),
|
||||
cube_werd_choice->certainty());
|
||||
word->best_choice->unichar_string().string(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->certainty());
|
||||
}
|
||||
|
||||
// Fill tesseract result's fields with cube results
|
||||
fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -452,13 +407,8 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
|
||||
*
|
||||
**********************************************************************/
|
||||
void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
|
||||
WERD_CHOICE* cube_werd_choice,
|
||||
const char* cube_best_str,
|
||||
WERD_RES* tess_werd_res) {
|
||||
// Replace tesseract results's best choice with cube's
|
||||
tess_werd_res->best_choice = cube_werd_choice;
|
||||
tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice);
|
||||
|
||||
delete tess_werd_res->box_word;
|
||||
tess_werd_res->box_word = new BoxWord(cube_box_word);
|
||||
tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
|
||||
@ -466,23 +416,13 @@ void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
|
||||
// Fill text and remaining fields
|
||||
tess_werd_res->word->set_text(cube_best_str);
|
||||
tess_werd_res->tess_failed = FALSE;
|
||||
tess_werd_res->tess_accepted =
|
||||
tess_acceptable_word(tess_werd_res->best_choice,
|
||||
tess_werd_res->raw_choice);
|
||||
tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res);
|
||||
// There is no output word, so we can' call AdaptableWord, but then I don't
|
||||
// think we need to. Fudge the result with accepted.
|
||||
tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
|
||||
|
||||
// Initialize the reject_map and set it to done, i.e., ignore all of
|
||||
// tesseract's tests for rejection
|
||||
tess_werd_res->reject_map.initialise(cube_werd_choice->length());
|
||||
// Set word to done, i.e., ignore all of tesseract's tests for rejection
|
||||
tess_werd_res->done = tess_werd_res->tess_accepted;
|
||||
|
||||
// Some sanity checks
|
||||
ASSERT_HOST(tess_werd_res->best_choice->length() ==
|
||||
tess_werd_res->best_choice->blob_choices()->length());
|
||||
ASSERT_HOST(tess_werd_res->best_choice->length() ==
|
||||
tess_werd_res->reject_map.length());
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -23,7 +23,6 @@
|
||||
|
||||
#include <ctype.h>
|
||||
#include "docqual.h"
|
||||
#include "tfacep.h"
|
||||
#include "reject.h"
|
||||
#include "tesscallback.h"
|
||||
#include "tessvars.h"
|
||||
@ -66,7 +65,7 @@ struct DocQualCallbacks {
|
||||
*************************************************************************/
|
||||
inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
|
||||
if (word->bln_boxes == NULL ||
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
|
||||
return 0;
|
||||
|
||||
DocQualCallbacks cb(word);
|
||||
@ -81,8 +80,8 @@ inT16 Tesseract::word_outline_errs(WERD_RES *word) {
|
||||
inT16 err_count = 0;
|
||||
|
||||
if (word->rebuild_word != NULL) {
|
||||
TBLOB* blob = word->rebuild_word->blobs;
|
||||
for (; blob != NULL; blob = blob->next) {
|
||||
for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
|
||||
TBLOB* blob = word->rebuild_word->blobs[b];
|
||||
err_count += count_outline_errs(word->best_choice->unichar_string()[i],
|
||||
blob->NumOutlines());
|
||||
i++;
|
||||
@ -101,7 +100,7 @@ void Tesseract::word_char_quality(WERD_RES *word,
|
||||
inT16 *match_count,
|
||||
inT16 *accepted_match_count) {
|
||||
if (word->bln_boxes == NULL ||
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
|
||||
return;
|
||||
|
||||
DocQualCallbacks cb(word);
|
||||
@ -118,7 +117,7 @@ void Tesseract::word_char_quality(WERD_RES *word,
|
||||
*************************************************************************/
|
||||
void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
|
||||
if (word->bln_boxes == NULL ||
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
|
||||
return;
|
||||
|
||||
DocQualCallbacks cb(word);
|
||||
@ -990,7 +989,8 @@ BOOL8 Tesseract::noise_outlines(TWERD *word) {
|
||||
inT16 max_dimension;
|
||||
float small_limit = kBlnXHeight * crunch_small_outlines_size;
|
||||
|
||||
for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
|
||||
for (int b = 0; b < word->NumBlobs(); ++b) {
|
||||
TBLOB* blob = word->blobs[b];
|
||||
for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
|
||||
outline_count++;
|
||||
box = ol->bounding_box();
|
||||
@ -1002,6 +1002,7 @@ BOOL8 Tesseract::noise_outlines(TWERD *word) {
|
||||
small_outline_count++;
|
||||
}
|
||||
}
|
||||
return (small_outline_count >= outline_count);
|
||||
return small_outline_count >= outline_count;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable:4244) // Conversion warnings
|
||||
#include "mathfix.h"
|
||||
#include <mathfix.h>
|
||||
#endif
|
||||
|
||||
#ifdef __MINGW32__
|
||||
@ -173,21 +173,21 @@ void EquationDetect::IdentifySpecialText(
|
||||
|
||||
BLOB_CHOICE_LIST ratings_equ, ratings_lang;
|
||||
C_BLOB* blob = blobnbox->cblob();
|
||||
TBLOB* tblob = TBLOB::PolygonalCopy(blob);
|
||||
// TODO(joeliu/rays) Fix this. We may have to normalize separately for
|
||||
// each classifier here, as they may require different PolygonalCopy.
|
||||
TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
|
||||
const TBOX& box = tblob->bounding_box();
|
||||
|
||||
// Normalize the blob. Set the origin to the place we want to be the
|
||||
// bottom-middle, and scaling is to make the height the x-height.
|
||||
float scaling = static_cast<float>(kBlnXHeight) / box.height();
|
||||
DENORM denorm;
|
||||
float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
|
||||
denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0,
|
||||
x_orig, y_orig, scaling, scaling,
|
||||
0.0f, static_cast<float>(kBlnBaselineOffset));
|
||||
TBLOB* normed_blob = new TBLOB(*tblob);
|
||||
normed_blob->Normalize(denorm);
|
||||
equ_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_equ, NULL);
|
||||
lang_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL);
|
||||
normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
|
||||
0.0f, static_cast<float>(kBlnBaselineOffset),
|
||||
false, NULL);
|
||||
equ_tesseract_->AdaptiveClassifier(normed_blob, &ratings_equ, NULL);
|
||||
lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang, NULL);
|
||||
delete normed_blob;
|
||||
delete tblob;
|
||||
|
||||
|
@ -35,6 +35,7 @@
|
||||
#define MAXSPACING 128 /*max expected spacing in pix */
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* @name fix_fuzzy_spaces()
|
||||
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
|
||||
@ -183,7 +184,7 @@ void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
|
||||
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
|
||||
src_wd = src_it.data();
|
||||
if (!src_wd->combination) {
|
||||
new_wd = new WERD_RES(*src_wd);
|
||||
new_wd = WERD_RES::deep_copy(src_wd);
|
||||
new_wd->combination = FALSE;
|
||||
new_wd->part_of_combo = FALSE;
|
||||
new_it.add_after_then_move(new_wd);
|
||||
@ -502,86 +503,6 @@ void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name uniformly_spaced()
|
||||
* Return true if one of the following are true:
|
||||
* - All inter-char gaps are the same width
|
||||
* - The largest gap is no larger than twice the mean/median of the others
|
||||
* - The largest gap is < normalised_max_nonspace
|
||||
* **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
|
||||
*/
|
||||
BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) {
|
||||
TBOX box;
|
||||
inT16 prev_right = -MAX_INT16;
|
||||
inT16 gap;
|
||||
inT16 max_gap = -MAX_INT16;
|
||||
inT16 max_gap_count = 0;
|
||||
STATS gap_stats(0, MAXSPACING);
|
||||
BOOL8 result;
|
||||
const ROW *row = word->denorm.row();
|
||||
float max_non_space;
|
||||
float normalised_max_nonspace;
|
||||
inT16 i = 0;
|
||||
inT16 offset = 0;
|
||||
STRING punct_chars = "\"`',.:;";
|
||||
|
||||
for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
|
||||
blob = blob->next) {
|
||||
box = blob->bounding_box();
|
||||
if ((prev_right > -MAX_INT16) &&
|
||||
(!punct_chars.contains(
|
||||
word->best_choice->unichar_string()
|
||||
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
|
||||
!punct_chars.contains(
|
||||
word->best_choice->unichar_string()[offset]))) {
|
||||
gap = box.left() - prev_right;
|
||||
if (gap < max_gap) {
|
||||
gap_stats.add(gap, 1);
|
||||
} else if (gap == max_gap) {
|
||||
max_gap_count++;
|
||||
} else {
|
||||
if (max_gap_count > 0)
|
||||
gap_stats.add(max_gap, max_gap_count);
|
||||
max_gap = gap;
|
||||
max_gap_count = 1;
|
||||
}
|
||||
}
|
||||
prev_right = box.right();
|
||||
offset += word->best_choice->unichar_lengths()[i++];
|
||||
}
|
||||
|
||||
max_non_space = (row->space() + 3 * row->kern()) / 4;
|
||||
normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();
|
||||
|
||||
result = (
|
||||
gap_stats.get_total() == 0 ||
|
||||
max_gap <= normalised_max_nonspace ||
|
||||
(gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
|
||||
(gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
|
||||
#ifndef SECURE_NAMES
|
||||
if ((debug_fix_space_level > 1)) {
|
||||
if (result) {
|
||||
tprintf(
|
||||
"ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
|
||||
"total=%d mean=%f median=%f\n",
|
||||
word->best_choice->unichar_string().string(), normalised_max_nonspace,
|
||||
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
|
||||
gap_stats.median());
|
||||
} else {
|
||||
tprintf(
|
||||
"REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
|
||||
"total=%d mean=%f median=%f\n",
|
||||
word->best_choice->unichar_string().string(), normalised_max_nonspace,
|
||||
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
|
||||
gap_stats.median());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
|
||||
if (word->done)
|
||||
return TRUE;
|
||||
@ -655,7 +576,6 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
|
||||
WERD_RES_LIST current_perm;
|
||||
WERD_RES_IT current_perm_it(¤t_perm);
|
||||
WERD_RES *old_word_res;
|
||||
WERD_RES *new_word_res;
|
||||
inT16 current_score;
|
||||
BOOL8 improved = FALSE;
|
||||
|
||||
@ -663,12 +583,12 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
|
||||
|
||||
dump_words(best_perm, best_score, 1, improved);
|
||||
|
||||
new_word_res = new WERD_RES;
|
||||
old_word_res = best_perm_it.data();
|
||||
// Even deep_copy doesn't copy the underlying WERD unless its combination
|
||||
// flag is true!.
|
||||
old_word_res->combination = TRUE; // Kludge to force deep copy
|
||||
*new_word_res = *old_word_res; // deep copy
|
||||
current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
|
||||
old_word_res->combination = FALSE; // Undo kludge
|
||||
current_perm_it.add_to_end(new_word_res);
|
||||
|
||||
break_noisiest_blob_word(current_perm);
|
||||
|
||||
@ -774,7 +694,6 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
|
||||
if (word_res->rebuild_word == NULL)
|
||||
return -1; // Can't handle cube words.
|
||||
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
// Normalised.
|
||||
int blob_count = word_res->box_word->length();
|
||||
ASSERT_HOST(blob_count <= 512);
|
||||
@ -789,7 +708,8 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
|
||||
word_res->best_choice->unichar_string().string());
|
||||
#endif
|
||||
|
||||
for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
|
||||
for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[i];
|
||||
if (word_res->reject_map[i].accepted())
|
||||
noise_score[i] = non_noise_limit;
|
||||
else
|
||||
@ -929,10 +849,10 @@ inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
word->best_choice->permuter() == USER_DAWG_PERM ||
|
||||
safe_dict_word(word) > 0) {
|
||||
TBLOB* blob = word->rebuild_word->blobs;
|
||||
int num_blobs = word->rebuild_word->NumBlobs();
|
||||
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
|
||||
for (i = 0; i < word->best_choice->length() && blob != NULL;
|
||||
++i, blob = blob->next) {
|
||||
for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB* blob = word->rebuild_word->blobs[i];
|
||||
if (word->best_choice->unichar_id(i) == space ||
|
||||
blob_noise_score(blob) < small_limit) {
|
||||
score -= 1; // penalise possibly erroneous non-space
|
||||
|
@ -62,9 +62,9 @@ const int kMaxCharTopRange = 48;
|
||||
// Returns the number of misfit blob tops in this word.
|
||||
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
|
||||
int bad_blobs = 0;
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
int blob_id = 0;
|
||||
for (; blob != NULL; blob = blob->next, ++blob_id) {
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
|
||||
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
|
||||
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
|
||||
int top = blob->bounding_box().top();
|
||||
@ -94,9 +94,9 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {
|
||||
// See comment above for overall algorithm.
|
||||
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
|
||||
STATS top_stats(0, MAX_UINT8);
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
int blob_id = 0;
|
||||
for (; blob != NULL; blob = blob->next, ++blob_id) {
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
|
||||
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
|
||||
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
|
||||
int top = blob->bounding_box().top();
|
||||
|
@ -33,7 +33,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "errcode.h"
|
||||
#include "globaloc.h" // For err_exit.
|
||||
|
||||
#define f(xc, yc) ((xc - factor*yc)*(xc - factor*yc))
|
||||
|
||||
|
@ -132,23 +132,7 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
|
||||
++certainty_count;
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
|
||||
if (choices != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
|
||||
for (int blob = 0; blob < blob_index_; ++blob)
|
||||
blob_choices_it.forward();
|
||||
BLOB_CHOICE_IT choice_it(blob_choices_it.data());
|
||||
for (choice_it.mark_cycle_pt();
|
||||
!choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
if (choice_it.data()->unichar_id() ==
|
||||
best_choice->unichar_id(blob_index_))
|
||||
break;
|
||||
}
|
||||
mean_certainty += choice_it.data()->certainty();
|
||||
} else {
|
||||
mean_certainty += best_choice->certainty();
|
||||
}
|
||||
mean_certainty += best_choice->certainty(blob_index_);
|
||||
++certainty_count;
|
||||
}
|
||||
if (certainty_count > 0) {
|
||||
@ -237,55 +221,83 @@ bool LTRResultIterator::WordIsNumeric() const {
|
||||
|
||||
// Returns true if the word contains blamer information.
|
||||
bool LTRResultIterator::HasBlamerInfo() const {
|
||||
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
|
||||
(it_->word()->blamer_bundle->debug.length() > 0 ||
|
||||
it_->word()->blamer_bundle->misadaption_debug.length() > 0));
|
||||
return it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
|
||||
it_->word()->blamer_bundle->HasDebugInfo();
|
||||
}
|
||||
|
||||
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
|
||||
// of the current word.
|
||||
void *LTRResultIterator::GetParamsTrainingBundle() const {
|
||||
const void *LTRResultIterator::GetParamsTrainingBundle() const {
|
||||
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL) ?
|
||||
&(it_->word()->blamer_bundle->params_training_bundle) : NULL;
|
||||
&(it_->word()->blamer_bundle->params_training_bundle()) : NULL;
|
||||
}
|
||||
|
||||
// Returns the pointer to the string with blamer information for this word.
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
const char *LTRResultIterator::GetBlamerDebug() const {
|
||||
return it_->word()->blamer_bundle->debug.string();
|
||||
return it_->word()->blamer_bundle->debug().string();
|
||||
}
|
||||
|
||||
// Returns the pointer to the string with misadaption information for this word.
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
|
||||
return it_->word()->blamer_bundle->misadaption_debug.string();
|
||||
return it_->word()->blamer_bundle->misadaption_debug().string();
|
||||
}
|
||||
|
||||
// Returns true if a truth string was recorded for the current word.
|
||||
bool LTRResultIterator::HasTruthString() const {
|
||||
if (it_->word() == NULL) return false; // Already at the end!
|
||||
if (it_->word()->blamer_bundle == NULL ||
|
||||
it_->word()->blamer_bundle->NoTruth()) {
|
||||
return false; // no truth information for this word
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if the given string is equivalent to the truth string for
|
||||
// the current word.
|
||||
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
|
||||
if (!HasTruthString()) return false;
|
||||
ASSERT_HOST(it_->word()->uch_set != NULL);
|
||||
WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
|
||||
return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded truth string for the current word.
|
||||
// Use delete [] to free after use.
|
||||
char* LTRResultIterator::WordTruthUTF8Text() const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
if (it_->word()->blamer_bundle == NULL ||
|
||||
it_->word()->blamer_bundle->incorrect_result_reason == IRR_NO_TRUTH) {
|
||||
return NULL; // no truth information for this word
|
||||
}
|
||||
const GenericVector<STRING> &truth_vec =
|
||||
it_->word()->blamer_bundle->truth_text;
|
||||
STRING truth_text;
|
||||
for (int i = 0; i < truth_vec.size(); ++i) truth_text += truth_vec[i];
|
||||
if (!HasTruthString()) return NULL;
|
||||
STRING truth_text = it_->word()->blamer_bundle->TruthString();
|
||||
int length = truth_text.length() + 1;
|
||||
char* result = new char[length];
|
||||
strncpy(result, truth_text.string(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded normalized OCR string for the
|
||||
// current word. Use delete [] to free after use.
|
||||
char* LTRResultIterator::WordNormedUTF8Text() const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
STRING ocr_text;
|
||||
WERD_CHOICE* best_choice = it_->word()->best_choice;
|
||||
const UNICHARSET *unicharset = it_->word()->uch_set;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
for (int i = 0; i < best_choice->length(); ++i) {
|
||||
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
|
||||
}
|
||||
int length = ocr_text.length() + 1;
|
||||
char* result = new char[length];
|
||||
strncpy(result, ocr_text.string(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns a pointer to serialized choice lattice.
|
||||
// Fills lattice_size with the number of bytes in lattice data.
|
||||
const char *LTRResultIterator::WordLattice(int *lattice_size) const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
if (it_->word()->blamer_bundle == NULL) return NULL;
|
||||
*lattice_size = it_->word()->blamer_bundle->lattice_size;
|
||||
return it_->word()->blamer_bundle->lattice_data;
|
||||
*lattice_size = it_->word()->blamer_bundle->lattice_size();
|
||||
return it_->word()->blamer_bundle->lattice_data();
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a superscript.
|
||||
@ -293,7 +305,8 @@ const char *LTRResultIterator::WordLattice(int *lattice_size) const {
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsSuperscript() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) ==
|
||||
SP_SUPERSCRIPT;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -302,7 +315,7 @@ bool LTRResultIterator::SymbolIsSuperscript() const {
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsSubscript() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -311,7 +324,7 @@ bool LTRResultIterator::SymbolIsSubscript() const {
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsDropcap() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -319,13 +332,11 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
|
||||
ASSERT_HOST(result_it.it_->word() != NULL);
|
||||
word_res_ = result_it.it_->word();
|
||||
PAGE_RES_IT res_it(*result_it.it_);
|
||||
WERD_CHOICE* best_choice = word_res_->best_choice;
|
||||
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
|
||||
BLOB_CHOICE_LIST* choices = NULL;
|
||||
if (word_res_->ratings != NULL)
|
||||
choices = word_res_->GetBlobChoices(result_it.blob_index_);
|
||||
if (choices != NULL && !choices->empty()) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
|
||||
for (int blob = 0; blob < result_it.blob_index_; ++blob)
|
||||
blob_choices_it.forward();
|
||||
choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
|
||||
choice_it_ = new BLOB_CHOICE_IT(choices);
|
||||
choice_it_->mark_cycle_pt();
|
||||
} else {
|
||||
choice_it_ = NULL;
|
||||
|
@ -23,7 +23,7 @@
|
||||
|
||||
#include "platform.h"
|
||||
#include "pageiterator.h"
|
||||
#include "unicharset.h"
|
||||
#include "unichar.h"
|
||||
|
||||
class BLOB_CHOICE_IT;
|
||||
class WERD_RES;
|
||||
@ -128,7 +128,7 @@ class TESS_API LTRResultIterator : public PageIterator {
|
||||
|
||||
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
|
||||
// of the current word.
|
||||
void *GetParamsTrainingBundle() const;
|
||||
const void *GetParamsTrainingBundle() const;
|
||||
|
||||
// Returns a pointer to the string with blamer information for this word.
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
@ -138,10 +138,21 @@ class TESS_API LTRResultIterator : public PageIterator {
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
const char *GetBlamerMisadaptionDebug() const;
|
||||
|
||||
// Returns true if a truth string was recorded for the current word.
|
||||
bool HasTruthString() const;
|
||||
|
||||
// Returns true if the given string is equivalent to the truth string for
|
||||
// the current word.
|
||||
bool EquivalentToTruth(const char *str) const;
|
||||
|
||||
// Returns a null terminated UTF-8 encoded truth string for the current word.
|
||||
// Use delete [] to free after use.
|
||||
char* WordTruthUTF8Text() const;
|
||||
|
||||
// Returns a null terminated UTF-8 encoded normalized OCR string for the
|
||||
// current word. Use delete [] to free after use.
|
||||
char* WordNormedUTF8Text() const;
|
||||
|
||||
// Returns a pointer to serialized choice lattice.
|
||||
// Fills lattice_size with the number of bytes in lattice data.
|
||||
const char *WordLattice(int *lattice_size) const;
|
||||
|
@ -29,14 +29,12 @@
|
||||
#include <errno.h>
|
||||
#endif
|
||||
#include "helpers.h"
|
||||
#include "tfacep.h"
|
||||
#include "tessvars.h"
|
||||
#include "control.h"
|
||||
#include "secname.h"
|
||||
#include "reject.h"
|
||||
#include "docqual.h"
|
||||
#include "output.h"
|
||||
#include "bestfirst.h"
|
||||
#include "globals.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
@ -242,13 +240,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
||||
(word->best_choice->unichar_id(0) == space)) {
|
||||
/* Prevent adjacent tilde across words - we know that adjacent tildes within
|
||||
words have been removed */
|
||||
word->best_choice->remove_unichar_id(0);
|
||||
if (word->best_choice->blob_choices() != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
|
||||
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
|
||||
}
|
||||
word->reject_map.remove_pos (0);
|
||||
word->box_word->DeleteBox(0);
|
||||
word->MergeAdjacentBlobs(0);
|
||||
}
|
||||
if (newline_type ||
|
||||
(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
|
||||
|
@ -303,16 +303,22 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
return BoundingBox(level, 0, left, top, right, bottom);
|
||||
}
|
||||
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
if (!BoundingBoxInternal(level, left, top, right, bottom))
|
||||
return false;
|
||||
// Convert to the coordinate system of the original image.
|
||||
*left = ClipToRange(*left / scale_ + rect_left_,
|
||||
*left = ClipToRange(*left / scale_ + rect_left_ - padding,
|
||||
rect_left_, rect_left_ + rect_width_);
|
||||
*top = ClipToRange(*top / scale_ + rect_top_,
|
||||
*top = ClipToRange(*top / scale_ + rect_top_ - padding,
|
||||
rect_top_, rect_top_ + rect_height_);
|
||||
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_,
|
||||
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
|
||||
*left, rect_left_ + rect_width_);
|
||||
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_,
|
||||
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
|
||||
*top, rect_top_ + rect_height_);
|
||||
return true;
|
||||
}
|
||||
@ -546,14 +552,15 @@ void PageIterator::BeginWord(int offset) {
|
||||
// Recognition has been done, so we are using the box_word, which
|
||||
// is already baseline denormalized.
|
||||
word_length_ = word_res->best_choice->length();
|
||||
ASSERT_HOST(word_res->box_word != NULL);
|
||||
if (word_res->box_word->length() != word_length_) {
|
||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
|
||||
word_length_, word_res->best_choice->unichar_string().string(),
|
||||
word_res->box_word->length());
|
||||
word_res->box_word->bounding_box().print();
|
||||
if (word_res->box_word != NULL) {
|
||||
if (word_res->box_word->length() != word_length_) {
|
||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
|
||||
word_length_, word_res->best_choice->unichar_string().string(),
|
||||
word_res->box_word->length());
|
||||
word_res->box_word->bounding_box().print();
|
||||
}
|
||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
||||
}
|
||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
||||
word_ = NULL;
|
||||
// We will be iterating the box_word.
|
||||
if (cblob_it_ != NULL) {
|
||||
@ -574,4 +581,13 @@ void PageIterator::BeginWord(int offset) {
|
||||
}
|
||||
}
|
||||
|
||||
bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
|
||||
if (it_->word() != NULL) {
|
||||
it_->word()->blamer_bundle = blamer_bundle;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "publictypes.h"
|
||||
#include "platform.h"
|
||||
|
||||
class BlamerBundle;
|
||||
class C_BLOB_IT;
|
||||
class PBLOB_IT;
|
||||
class PAGE_RES;
|
||||
@ -189,6 +190,8 @@ class TESS_API PageIterator {
|
||||
*/
|
||||
bool BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top, int* right, int* bottom) const;
|
||||
bool BoundingBox(PageIteratorLevel level, const int padding,
|
||||
int* left, int* top, int* right, int* bottom) const;
|
||||
/**
|
||||
* Returns the bounding rectangle of the object in a coordinate system of the
|
||||
* working image rectangle having its origin at (rect_left_, rect_top_) with
|
||||
@ -282,6 +285,12 @@ class TESS_API PageIterator {
|
||||
bool *is_crown,
|
||||
int *first_line_indent) const;
|
||||
|
||||
// If the current WERD_RES (it_->word()) is not NULL, sets the BlamerBundle
|
||||
// of the current word to the given pointer (takes ownership of the pointer)
|
||||
// and returns true.
|
||||
// Can only be used when iterating on the word level.
|
||||
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Sets up the internal data for iterating the blobs of a new word, then
|
||||
|
@ -16,8 +16,8 @@
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
#ifdef _MSC_VER
|
||||
#define __func__ __FUNCTION__
|
||||
#ifdef _MSC_VER
|
||||
#define __func__ __FUNCTION__
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
@ -40,11 +40,6 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// The tab vectors for a given line should be ignored if both its tab vectors
|
||||
// are infrequent, specifically, if both tab vectors appear at most once per
|
||||
// kStrayLinePer lines in a block.
|
||||
const int kStrayLinePer = 6;
|
||||
|
||||
// Special "weak" ParagraphModels.
|
||||
const ParagraphModel *kCrownLeft
|
||||
= reinterpret_cast<ParagraphModel *>(0xDEAD111F);
|
||||
@ -727,7 +722,15 @@ void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
|
||||
// tab stop is frequent.
|
||||
SimpleClusterer lefts(tolerance);
|
||||
SimpleClusterer rights(tolerance);
|
||||
int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
|
||||
|
||||
// Outlier elimination. We might want to switch this to test outlier-ness
|
||||
// based on how strange a position an outlier is in instead of or in addition
|
||||
// to how rare it is. These outliers get re-added if we end up having too
|
||||
// few tab stops, to work with, however.
|
||||
int infrequent_enough_to_ignore = 0;
|
||||
if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
|
||||
if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
|
||||
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
|
||||
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
|
||||
@ -739,6 +742,54 @@ void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
|
||||
}
|
||||
lefts.GetClusters(left_tabs);
|
||||
rights.GetClusters(right_tabs);
|
||||
|
||||
if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
|
||||
(right_tabs->size() == 1 && left_tabs->size() >= 4)) {
|
||||
// One side is really ragged, and the other only has one tab stop,
|
||||
// so those "insignificant outliers" are probably important, actually.
|
||||
// This often happens on a page of an index. Add back in the ones
|
||||
// we omitted in the first pass.
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
|
||||
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
|
||||
if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
|
||||
initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
|
||||
lefts.Add((*rows)[i].lindent_);
|
||||
rights.Add((*rows)[i].rindent_);
|
||||
}
|
||||
}
|
||||
}
|
||||
lefts.GetClusters(left_tabs);
|
||||
rights.GetClusters(right_tabs);
|
||||
|
||||
// If one side is almost a two-indent aligned side, and the other clearly
|
||||
// isn't, try to prune out the least frequent tab stop from that side.
|
||||
if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
|
||||
int to_prune = -1;
|
||||
for (int i = left_tabs->size() - 1; i >= 0; i--) {
|
||||
if (to_prune < 0 ||
|
||||
(*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
|
||||
to_prune = i;
|
||||
}
|
||||
}
|
||||
if (to_prune >= 0 &&
|
||||
(*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
|
||||
left_tabs->remove(to_prune);
|
||||
}
|
||||
}
|
||||
if (right_tabs->size() == 3 && right_tabs->size() >= 4) {
|
||||
int to_prune = -1;
|
||||
for (int i = right_tabs->size() - 1; i >= 0; i--) {
|
||||
if (to_prune < 0 ||
|
||||
(*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
|
||||
to_prune = i;
|
||||
}
|
||||
}
|
||||
if (to_prune >= 0 &&
|
||||
(*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
|
||||
right_tabs->remove(to_prune);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Given a paragraph model mark rows[row_start, row_end) as said model
|
||||
@ -816,6 +867,11 @@ struct GeometricClassifierState {
|
||||
tolerance = InterwordSpace(*r, r_start, r_end);
|
||||
CalculateTabStops(r, r_start, r_end, tolerance,
|
||||
&left_tabs, &right_tabs);
|
||||
if (debug_level >= 3) {
|
||||
tprintf("Geometry: TabStop cluster tolerance = %d; "
|
||||
"%d left tabs; %d right tabs\n",
|
||||
tolerance, left_tabs.size(), right_tabs.size());
|
||||
}
|
||||
ltr = (*r)[r_start].ri_->ltr;
|
||||
}
|
||||
|
||||
@ -1079,16 +1135,18 @@ void GeometricClassify(int debug_level,
|
||||
firsts[s.AlignsideTabIndex(s.row_start)]++;
|
||||
// For each line, if the first word would have fit on the previous
|
||||
// line count it as a likely paragraph start line.
|
||||
bool jam_packed = true;
|
||||
for (int i = s.row_start + 1; i < s.row_end; i++) {
|
||||
if (s.FirstWordWouldHaveFit(i - 1, i)) {
|
||||
firsts[s.AlignsideTabIndex(i)]++;
|
||||
jam_packed = false;
|
||||
}
|
||||
}
|
||||
// Make an extra accounting for the last line of the paragraph just
|
||||
// in case it's the only short line in the block. That is, take its
|
||||
// first word as typical and see if this looks like the *last* line
|
||||
// of a paragraph. If so, mark the *other* indent as probably a first.
|
||||
if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
|
||||
if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
|
||||
firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
|
||||
}
|
||||
|
||||
@ -1543,24 +1601,26 @@ void RecomputeMarginsAndClearHypotheses(
|
||||
}
|
||||
}
|
||||
|
||||
// Return the minimum inter-word space in rows[row_start, row_end).
|
||||
// Return the median inter-word space in rows[row_start, row_end).
|
||||
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
|
||||
int row_start, int row_end) {
|
||||
if (row_end < row_start + 1) return 1;
|
||||
bool legit = false;
|
||||
int natural_space = rows[row_start].ri_->average_interword_space;
|
||||
int word_height = (rows[row_start].ri_->lword_box.height() +
|
||||
rows[row_end - 1].ri_->lword_box.height()) / 2;
|
||||
int word_width = (rows[row_start].ri_->lword_box.width() +
|
||||
rows[row_end - 1].ri_->lword_box.width()) / 2;
|
||||
STATS spacing_widths(0, 5 + word_width);
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
if (rows[i].ri_->num_words > 1) {
|
||||
if (!legit) {
|
||||
natural_space = rows[i].ri_->average_interword_space;
|
||||
legit = true;
|
||||
} else {
|
||||
if (rows[i].ri_->average_interword_space < natural_space)
|
||||
natural_space = rows[i].ri_->average_interword_space;
|
||||
}
|
||||
spacing_widths.add(rows[i].ri_->average_interword_space, 1);
|
||||
}
|
||||
}
|
||||
return natural_space;
|
||||
int minimum_reasonable_space = word_height / 3;
|
||||
if (minimum_reasonable_space < 2)
|
||||
minimum_reasonable_space = 2;
|
||||
int median = spacing_widths.median();
|
||||
return (median > minimum_reasonable_space)
|
||||
? median : minimum_reasonable_space;
|
||||
}
|
||||
|
||||
// Return whether the first word on the after line can fit in the space at
|
||||
@ -2274,6 +2334,7 @@ void DetectParagraphs(int debug_level,
|
||||
GeometricClassify(debug_level, &rows,
|
||||
leftovers[i].begin, leftovers[i].end, &theory);
|
||||
}
|
||||
|
||||
// Undo any flush models for which there's little evidence.
|
||||
DowngradeWeakestToCrowns(debug_level, &theory, &rows);
|
||||
|
||||
|
@ -23,7 +23,6 @@
|
||||
#include "control.h"
|
||||
#include "cutil.h"
|
||||
#include "host.h"
|
||||
#include "permute.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "stopper.h"
|
||||
@ -38,10 +37,6 @@ FILE *Tesseract::init_recog_training(const STRING &fname) {
|
||||
if (tessedit_ambigs_training) {
|
||||
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
|
||||
tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
|
||||
save_blob_choices.set_value(1); // save individual char choices
|
||||
getDict().save_raw_choices.set_value(1); // save raw choices
|
||||
getDict().permute_only_top.set_value(true); // use only top choice permuter
|
||||
tessedit_ok_mode.set_value(0); // turn off context checking
|
||||
// Explore all segmentations.
|
||||
getDict().stopper_no_acceptable_choices.set_value(1);
|
||||
}
|
||||
@ -156,6 +151,47 @@ void Tesseract::recog_training_segmented(const STRING &fname,
|
||||
examined_words, total_words);
|
||||
}
|
||||
|
||||
// Helper prints the given set of blob choices.
|
||||
static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
|
||||
const UNICHARSET& unicharset,
|
||||
const char *label, FILE *output_file) {
|
||||
float rating = 0.0f;
|
||||
float certainty = 0.0f;
|
||||
for (int i = 0; i < length; ++i) {
|
||||
const BLOB_CHOICE* blob_choice = blob_choices[i];
|
||||
fprintf(output_file, "%s",
|
||||
unicharset.id_to_unichar(blob_choice->unichar_id()));
|
||||
rating += blob_choice->rating();
|
||||
if (certainty > blob_choice->certainty())
|
||||
certainty = blob_choice->certainty();
|
||||
}
|
||||
fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
|
||||
label, rating, certainty);
|
||||
}
|
||||
|
||||
// Helper recursively prints all paths through the ratings matrix, starting
|
||||
// at column col.
|
||||
static void PrintMatrixPaths(int col, int dim,
|
||||
const MATRIX& ratings,
|
||||
int length, const BLOB_CHOICE** blob_choices,
|
||||
const UNICHARSET& unicharset,
|
||||
const char *label, FILE *output_file) {
|
||||
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
|
||||
if (ratings.get(col, row) != NOT_CLASSIFIED) {
|
||||
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
blob_choices[length] = bc_it.data();
|
||||
if (row + 1 < dim) {
|
||||
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
|
||||
unicharset, label, output_file);
|
||||
} else {
|
||||
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
|
||||
// raw choice as a result of the classification. For words labeled with a
|
||||
// single unichar also outputs all alternatives from blob_choices of the
|
||||
@ -165,44 +201,25 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
|
||||
BLOCK_RES *block_res,
|
||||
const char *label,
|
||||
FILE *output_file) {
|
||||
int offset;
|
||||
// Classify word.
|
||||
fflush(stdout);
|
||||
classify_word_pass1(block_res->block, row_res->row, werd_res);
|
||||
WERD_CHOICE *best_choice = werd_res->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
ASSERT_HOST(best_choice->blob_choices() != NULL);
|
||||
|
||||
// Compute the number of unichars in the label.
|
||||
int label_num_unichars = 0;
|
||||
int step = 1; // should be non-zero on the first iteration
|
||||
for (offset = 0; label[offset] != '\0' && step > 0;
|
||||
step = werd_res->uch_set->step(label + offset),
|
||||
offset += step, ++label_num_unichars);
|
||||
if (step == 0) {
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
|
||||
tprintf("Not outputting illegal unichar %s\n", label);
|
||||
return;
|
||||
}
|
||||
|
||||
// Output all classifier choices for the unigrams (1->1 classifications).
|
||||
if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
|
||||
BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
|
||||
outer_blob_choice_it.set_to_list(best_choice->blob_choices());
|
||||
BLOB_CHOICE_IT blob_choice_it;
|
||||
blob_choice_it.set_to_list(outer_blob_choice_it.data());
|
||||
for (blob_choice_it.mark_cycle_pt();
|
||||
!blob_choice_it.cycled_list();
|
||||
blob_choice_it.forward()) {
|
||||
BLOB_CHOICE *blob_choice = blob_choice_it.data();
|
||||
if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
|
||||
fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
|
||||
unicharset.id_to_unichar(blob_choice->unichar_id()),
|
||||
label, blob_choice->rating(), blob_choice->certainty());
|
||||
}
|
||||
}
|
||||
}
|
||||
// Output raw choices for many->many and 1->many classifications.
|
||||
getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
|
||||
// Dump all paths through the ratings matrix (which is normally small).
|
||||
int dim = werd_res->ratings->dimension();
|
||||
const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
|
||||
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
|
||||
unicharset, label, output_file);
|
||||
delete [] blob_choices;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -30,13 +30,13 @@
|
||||
#include "scanutils.h"
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include "memry.h"
|
||||
#include "genericvector.h"
|
||||
#include "reject.h"
|
||||
#include "tfacep.h"
|
||||
#include "imgs.h"
|
||||
#include "control.h"
|
||||
#include "docqual.h"
|
||||
#include "secname.h"
|
||||
#include "globaloc.h" // For err_exit.
|
||||
#include "globals.h"
|
||||
#include "helpers.h"
|
||||
|
||||
@ -58,126 +58,26 @@ CLISTIZEH (STRING) CLISTIZE (STRING)
|
||||
*************************************************************************/
|
||||
|
||||
namespace tesseract {
|
||||
void Tesseract::set_done( //set done flag
|
||||
WERD_RES *word,
|
||||
inT16 pass) {
|
||||
/*
|
||||
0: Original heuristic used in Tesseract and Ray's prototype Resaljet
|
||||
*/
|
||||
if (tessedit_ok_mode == 0) {
|
||||
/* NOTE - done even if word contains some or all spaces !!! */
|
||||
word->done = word->tess_accepted;
|
||||
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
|
||||
bool word_is_ambig = word->best_choice->dangerous_ambig_found();
|
||||
bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
word->best_choice->permuter() == USER_DAWG_PERM;
|
||||
if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
|
||||
one_ell_conflict(word, FALSE)) {
|
||||
if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
|
||||
word->done = FALSE;
|
||||
}
|
||||
/*
|
||||
1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
|
||||
*/
|
||||
else if (tessedit_ok_mode == 1) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
if (word->done && ((!word_from_dict &&
|
||||
word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
|
||||
if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
|
||||
word->done = FALSE;
|
||||
}
|
||||
/*
|
||||
2: as 1 + only accept dict words or numerics in pass 1
|
||||
*/
|
||||
else if (tessedit_ok_mode == 2) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(pass == 1) &&
|
||||
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
/*
|
||||
3: as 2 + only accept dict words or numerics in pass 2 as well
|
||||
*/
|
||||
else if (tessedit_ok_mode == 3) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
/*
|
||||
4: as 2 + reject dict ambigs in pass 1
|
||||
*/
|
||||
else if (tessedit_ok_mode == 4) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(pass == 1) &&
|
||||
(((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) ||
|
||||
(test_ambig_word (word)))) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
/*
|
||||
5: as 3 + reject dict ambigs in both passes
|
||||
*/
|
||||
else if (tessedit_ok_mode == 5) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) ||
|
||||
(test_ambig_word (word)))) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
tprintf ("BAD tessedit_ok_mode\n");
|
||||
err_exit();
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("set_done(): done=%d\n", word->done);
|
||||
word->best_choice->print("");
|
||||
}
|
||||
}
|
||||
|
||||
@ -189,12 +89,7 @@ void Tesseract::set_done( //set done flag
|
||||
*
|
||||
* Sets a reject map for the word.
|
||||
*************************************************************************/
|
||||
void Tesseract::make_reject_map( //make rej map for wd //detailed results
|
||||
WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||||
ROW *row,
|
||||
inT16 pass //1st or 2nd?
|
||||
) {
|
||||
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
|
||||
int i;
|
||||
int offset;
|
||||
|
||||
@ -208,7 +103,7 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results
|
||||
*/
|
||||
if (tessedit_reject_mode == 0) {
|
||||
if (!word->done)
|
||||
reject_poor_matches(word, blob_choices);
|
||||
reject_poor_matches(word);
|
||||
} else if (tessedit_reject_mode == 5) {
|
||||
/*
|
||||
5: Reject I/1/l from words where there is no strong contextual confirmation;
|
||||
@ -313,45 +208,13 @@ void Tesseract::reject_I_1_L(WERD_RES *word) {
|
||||
} // namespace tesseract
|
||||
|
||||
|
||||
void reject_poor_matches( //detailed results
|
||||
WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
float threshold;
|
||||
inT16 i = 0;
|
||||
inT16 offset = 0;
|
||||
//super iterator
|
||||
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
|
||||
BLOB_CHOICE_IT choice_it; //real iterator
|
||||
|
||||
#ifndef SECURE_NAMES
|
||||
if (strlen(word->best_choice->unichar_lengths().string()) !=
|
||||
list_it.length()) {
|
||||
tprintf
|
||||
("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
|
||||
word->best_choice->unichar_string().string(),
|
||||
strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
|
||||
word->box_word->length());
|
||||
}
|
||||
#endif
|
||||
ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
|
||||
list_it.length ());
|
||||
ASSERT_HOST(word->box_word->length() == list_it.length());
|
||||
threshold = compute_reject_threshold (blob_choices);
|
||||
|
||||
for (list_it.mark_cycle_pt ();
|
||||
!list_it.cycled_list (); list_it.forward (), i++,
|
||||
offset += word->best_choice->unichar_lengths()[i]) {
|
||||
/* NB - only compares the threshold against the TOP choice char in the
|
||||
choices list for a blob !! - the selected one may be below the threshold
|
||||
*/
|
||||
choice_it.set_to_list (list_it.data ());
|
||||
if ((word->best_choice->unichar_string()[offset] == ' ') ||
|
||||
(choice_it.length () == 0))
|
||||
//rej unrecognised blobs
|
||||
word->reject_map[i].setrej_tess_failure ();
|
||||
else if (choice_it.data ()->certainty () < threshold)
|
||||
//rej poor score blob
|
||||
word->reject_map[i].setrej_poor_match ();
|
||||
void reject_poor_matches(WERD_RES *word) {
|
||||
float threshold = compute_reject_threshold(word->best_choice);
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
|
||||
word->reject_map[i].setrej_tess_failure();
|
||||
else if (word->best_choice->certainty(i) < threshold)
|
||||
word->reject_map[i].setrej_poor_match();
|
||||
}
|
||||
}
|
||||
|
||||
@ -364,52 +227,32 @@ void reject_poor_matches( //detailed results
|
||||
* gap in the certainty value.
|
||||
**********************************************************************/
|
||||
|
||||
float compute_reject_threshold( //compute threshold //detailed results
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
inT16 index; //to ratings
|
||||
inT16 blob_count; //no of blobs in word
|
||||
inT16 ok_blob_count = 0; //non TESS rej blobs in word
|
||||
float *ratings; //array of confidences
|
||||
float threshold; //rejection threshold
|
||||
float bestgap; //biggest gap
|
||||
float gapstart; //bottom of gap
|
||||
//super iterator
|
||||
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
|
||||
BLOB_CHOICE_IT choice_it; //real iterator
|
||||
float compute_reject_threshold(WERD_CHOICE* word) {
|
||||
float threshold; // rejection threshold
|
||||
float bestgap = 0.0f; // biggest gap
|
||||
float gapstart; // bottom of gap
|
||||
// super iterator
|
||||
BLOB_CHOICE_IT choice_it; // real iterator
|
||||
|
||||
blob_count = blob_choices->length ();
|
||||
ratings = (float *) alloc_mem (blob_count * sizeof (float));
|
||||
for (list_it.mark_cycle_pt (), index = 0;
|
||||
!list_it.cycled_list (); list_it.forward (), index++) {
|
||||
choice_it.set_to_list (list_it.data ());
|
||||
if (choice_it.length () > 0) {
|
||||
ratings[ok_blob_count] = choice_it.data ()->certainty ();
|
||||
//get in an array
|
||||
// tprintf("Rating[%d]=%c %g %g\n",
|
||||
// index,choice_it.data()->char_class(),
|
||||
// choice_it.data()->rating(),choice_it.data()->certainty());
|
||||
ok_blob_count++;
|
||||
}
|
||||
int blob_count = word->length();
|
||||
GenericVector<float> ratings;
|
||||
ratings.init_to_size(blob_count, 0.0f);
|
||||
for (int i = 0; i < blob_count; ++i) {
|
||||
ratings[i] = word->certainty(i);
|
||||
}
|
||||
ASSERT_HOST (index == blob_count);
|
||||
qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
|
||||
//sort them
|
||||
bestgap = 0;
|
||||
gapstart = ratings[0] - 1; //all reject if none better
|
||||
if (ok_blob_count >= 3) {
|
||||
for (index = 0; index < ok_blob_count - 1; index++) {
|
||||
ratings.sort();
|
||||
gapstart = ratings[0] - 1; // all reject if none better
|
||||
if (blob_count >= 3) {
|
||||
for (int index = 0; index < blob_count - 1; index++) {
|
||||
if (ratings[index + 1] - ratings[index] > bestgap) {
|
||||
bestgap = ratings[index + 1] - ratings[index];
|
||||
//find biggest
|
||||
// find biggest
|
||||
gapstart = ratings[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
threshold = gapstart + bestgap / 2;
|
||||
// tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
|
||||
// ratings[0],ratings[index],bestgap,threshold);
|
||||
|
||||
free_mem(ratings);
|
||||
return threshold;
|
||||
}
|
||||
|
||||
@ -680,21 +523,6 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
BOOL8 Tesseract::test_ambig_word( //test for ambiguity
|
||||
WERD_RES *word) {
|
||||
BOOL8 ambig = FALSE;
|
||||
|
||||
if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
|
||||
(word->best_choice->permuter () == FREQ_DAWG_PERM) ||
|
||||
(word->best_choice->permuter () == USER_DAWG_PERM)) {
|
||||
ambig = !getDict().NoDangerousAmbig(
|
||||
word->best_choice, NULL, false, NULL, NULL);
|
||||
}
|
||||
return ambig;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* dont_allow_1Il()
|
||||
* Dont unreject LONE accepted 1Il conflict set chars
|
||||
@ -786,10 +614,9 @@ inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
||||
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->best_choice->blob_choices()
|
||||
// might not contain the right BLOB_CHOICE coresponding to each character
|
||||
// in word_res->best_choice. However, the length of blob_choices and
|
||||
// word_res->best_choice will remain the same.
|
||||
// Note: After running this function word_res->ratings
|
||||
// might not contain the right BLOB_CHOICE corresponding to each character
|
||||
// in word_res->best_choice.
|
||||
void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int i;
|
||||
@ -801,16 +628,16 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
if (tessedit_lower_flip_hyphen <= 1)
|
||||
return;
|
||||
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
|
||||
bool modified = false;
|
||||
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
|
||||
blob = blob->next) {
|
||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[i];
|
||||
out_box = blob->bounding_box();
|
||||
if (blob->next == NULL)
|
||||
if (i + 1 == num_blobs)
|
||||
next_left = 9999;
|
||||
else
|
||||
next_left = blob->next->bounding_box().left();
|
||||
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
|
||||
// Dont touch small or touching blobs - it is too dangerous.
|
||||
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
|
||||
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
|
||||
@ -846,10 +673,9 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->best_choice->blob_choices()
|
||||
// might not contain the right BLOB_CHOICE coresponding to each character
|
||||
// in word_res->best_choice. However, the length of blob_choices and
|
||||
// word_res->best_choice will remain the same.
|
||||
// Note: After running this function word_res->ratings
|
||||
// might not contain the right BLOB_CHOICE corresponding to each character
|
||||
// in word_res->best_choice.
|
||||
void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int i;
|
||||
@ -858,9 +684,9 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||
if (!tessedit_flip_0O)
|
||||
return;
|
||||
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
|
||||
blob = blob->next) {
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[i];
|
||||
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
|
||||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
|
||||
out_box = blob->bounding_box();
|
||||
|
@ -24,8 +24,8 @@
|
||||
#include "pageres.h"
|
||||
|
||||
void reject_blanks(WERD_RES *word);
|
||||
void reject_poor_matches(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
float compute_reject_threshold(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
void reject_poor_matches(WERD_RES *word);
|
||||
float compute_reject_threshold(WERD_CHOICE* word);
|
||||
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths);
|
||||
void dont_allow_1Il(WERD_RES *word);
|
||||
void flip_hyphens(WERD_RES *word);
|
||||
|
@ -24,8 +24,9 @@
|
||||
|
||||
#include "platform.h"
|
||||
#include "ltrresultiterator.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
template <typename T> class GenericVector;
|
||||
template <typename T> class GenericVectorEqEq;
|
||||
class BLOB_CHOICE_IT;
|
||||
class WERD_RES;
|
||||
class STRING;
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "fileerr.h"
|
||||
#include "globaloc.h" // For err_exit.
|
||||
#include "tprintf.h"
|
||||
#include "img.h"
|
||||
#include "imgscale.h"
|
||||
|
@ -21,25 +21,22 @@
|
||||
#pragma warning(disable:4244) // Conversion warnings
|
||||
#endif
|
||||
|
||||
#include "tfacep.h"
|
||||
#include "tfacepp.h"
|
||||
#include "tessbox.h"
|
||||
#include "mfoutline.h"
|
||||
#include "tessbox.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#define EXTERN
|
||||
|
||||
/**
|
||||
* @name tess_segment_pass1
|
||||
* @name tess_segment_pass_n
|
||||
*
|
||||
* Segment a word using the pass1 conditions of the tess segmenter.
|
||||
* Segment a word using the pass_n conditions of the tess segmenter.
|
||||
* @param pass_n pass number
|
||||
* @param word word to do
|
||||
* @param blob_choices list of blob lists
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
void Tesseract::tess_segment_pass1(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
|
||||
int saved_enable_assoc = 0;
|
||||
int saved_chop_enable = 0;
|
||||
|
||||
@ -48,46 +45,17 @@ void Tesseract::tess_segment_pass1(WERD_RES *word,
|
||||
saved_chop_enable = chop_enable;
|
||||
wordrec_enable_assoc.set_value(0);
|
||||
chop_enable.set_value(0);
|
||||
if (word->word->flag(W_REP_CHAR))
|
||||
getDict().permute_only_top.set_value(true);
|
||||
}
|
||||
set_pass1();
|
||||
recog_word(word, blob_choices);
|
||||
if (pass_n == 1)
|
||||
set_pass1();
|
||||
else
|
||||
set_pass2();
|
||||
recog_word(word);
|
||||
if (word->best_choice == NULL)
|
||||
word->SetupFake(*word->uch_set);
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
wordrec_enable_assoc.set_value(saved_enable_assoc);
|
||||
chop_enable.set_value(saved_chop_enable);
|
||||
getDict().permute_only_top.set_value(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name tess_segment_pass2
|
||||
*
|
||||
* Segment a word using the pass2 conditions of the tess segmenter.
|
||||
* @param word word to do
|
||||
* @param blob_choices list of blob lists
|
||||
*/
|
||||
|
||||
void Tesseract::tess_segment_pass2(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
int saved_enable_assoc = 0;
|
||||
int saved_chop_enable = 0;
|
||||
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
saved_enable_assoc = wordrec_enable_assoc;
|
||||
saved_chop_enable = chop_enable;
|
||||
wordrec_enable_assoc.set_value(0);
|
||||
chop_enable.set_value(0);
|
||||
if (word->word->flag(W_REP_CHAR))
|
||||
getDict().permute_only_top.set_value(true);
|
||||
}
|
||||
set_pass2();
|
||||
recog_word(word, blob_choices);
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
wordrec_enable_assoc.set_value(saved_enable_assoc);
|
||||
chop_enable.set_value(saved_chop_enable);
|
||||
getDict().permute_only_top.set_value(false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -98,10 +66,8 @@ void Tesseract::tess_segment_pass2(WERD_RES *word,
|
||||
* @param word_choice after context
|
||||
* @param raw_choice before context
|
||||
*/
|
||||
BOOL8 Tesseract::tess_acceptable_word(
|
||||
WERD_CHOICE *word_choice, // after context
|
||||
WERD_CHOICE *raw_choice) { // before context
|
||||
return getDict().AcceptableResult(*word_choice);
|
||||
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
|
||||
return getDict().AcceptableResult(word);
|
||||
}
|
||||
|
||||
|
||||
|
@ -17,30 +17,17 @@
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
//#include <osfcn.h>
|
||||
//#include <signal.h>
|
||||
//#include <time.h>
|
||||
//#include <unistd.h>
|
||||
#include "tfacep.h" //must be before main.h
|
||||
//#include "fileerr.h"
|
||||
#include "stderr.h"
|
||||
#include "basedir.h"
|
||||
#include "tessvars.h"
|
||||
//#include "debgwin.h"
|
||||
//#include "epapdest.h"
|
||||
#include "control.h"
|
||||
#include "imgs.h"
|
||||
#include "reject.h"
|
||||
#include "pageres.h"
|
||||
//#include "gpapdest.h"
|
||||
#include "nwmain.h"
|
||||
#include "pgedit.h"
|
||||
#include "tprintf.h"
|
||||
//#include "ipeerr.h"
|
||||
//#include "restart.h"
|
||||
#include "tessedit.h"
|
||||
//#include "fontfind.h"
|
||||
#include "permute.h"
|
||||
#include "stopper.h"
|
||||
#include "intmatcher.h"
|
||||
#include "chop.h"
|
||||
@ -190,9 +177,16 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
|
||||
right_to_left_ = unicharset.major_right_to_left();
|
||||
|
||||
// Setup initial unichar ambigs table and read universal ambigs.
|
||||
UNICHARSET encoder_unicharset;
|
||||
encoder_unicharset.CopyFrom(unicharset);
|
||||
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
|
||||
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
|
||||
|
||||
if (!tessedit_ambigs_training &&
|
||||
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
|
||||
unichar_ambigs.LoadUnicharAmbigs(
|
||||
encoder_unicharset,
|
||||
tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
|
||||
ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
|
||||
@ -210,6 +204,23 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
tprintf("Loaded Cube with combiner\n");
|
||||
}
|
||||
|
||||
// Init ParamsModel.
|
||||
// Load pass1 and pass2 weights (for now these two sets are the same, but in
|
||||
// the future separate sets of weights can be generated).
|
||||
for (int p = ParamsModel::PTRAIN_PASS1;
|
||||
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
|
||||
language_model_->getParamsModel().SetPass(
|
||||
static_cast<ParamsModel::PassEnum>(p));
|
||||
if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
|
||||
if (!language_model_->getParamsModel().LoadFromFp(
|
||||
lang.string(), tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -323,6 +334,30 @@ int Tesseract::init_tesseract(
|
||||
tprintf("Tesseract couldn't load any languages!\n");
|
||||
return -1; // Couldn't load any language!
|
||||
}
|
||||
if (!sub_langs_.empty()) {
|
||||
// In multilingual mode word ratings have to be directly comparable,
|
||||
// so use the same language model weights for all languages:
|
||||
// use the primary language's params model if
|
||||
// tessedit_use_primary_params_model is set,
|
||||
// otherwise use default language model weights.
|
||||
if (tessedit_use_primary_params_model) {
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
sub_langs_[s]->language_model_->getParamsModel().Copy(
|
||||
this->language_model_->getParamsModel());
|
||||
}
|
||||
tprintf("Using params model of the primary language\n");
|
||||
if (tessdata_manager_debug_level) {
|
||||
this->language_model_->getParamsModel().Print();
|
||||
}
|
||||
} else {
|
||||
this->language_model_->getParamsModel().Clear();
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
sub_langs_[s]->language_model_->getParamsModel().Clear();
|
||||
}
|
||||
tprintf("Using default language params\n");
|
||||
}
|
||||
}
|
||||
|
||||
SetupUniversalFontIds();
|
||||
return 0;
|
||||
}
|
||||
@ -420,7 +455,7 @@ int Tesseract::init_tesseract_lm(const char *arg0,
|
||||
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
|
||||
NULL, 0, NULL, NULL, false))
|
||||
return -1;
|
||||
getDict().Load();
|
||||
getDict().Load(Dict::GlobalDawgCache());
|
||||
tessdata_manager.End();
|
||||
return 0;
|
||||
}
|
||||
|
@ -221,16 +221,16 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
|
||||
features->push_back(cube_best_bigram_cost);
|
||||
}
|
||||
// case-insensitive string comparison, including punctuation
|
||||
int compare_nocase_punc = CompareStrings(cube_best_str.c_str(),
|
||||
tess_str.c_str(), false, true);
|
||||
int compare_nocase_punc = CompareStrings(cube_best_str,
|
||||
tess_str, false, true);
|
||||
features->push_back(compare_nocase_punc == 0);
|
||||
// case-sensitive string comparison, ignoring punctuation
|
||||
int compare_case_nopunc = CompareStrings(cube_best_str.c_str(),
|
||||
tess_str.c_str(), true, false);
|
||||
int compare_case_nopunc = CompareStrings(cube_best_str,
|
||||
tess_str, true, false);
|
||||
features->push_back(compare_case_nopunc == 0);
|
||||
// case-insensitive string comparison, ignoring punctuation
|
||||
int compare_nocase_nopunc = CompareStrings(cube_best_str.c_str(),
|
||||
tess_str.c_str(), true, true);
|
||||
int compare_nocase_nopunc = CompareStrings(cube_best_str,
|
||||
tess_str, true, true);
|
||||
features->push_back(compare_nocase_nopunc == 0);
|
||||
return true;
|
||||
}
|
||||
|
@ -1,37 +0,0 @@
|
||||
/**********************************************************************
|
||||
* File: tfacep.h (Formerly tfacep.h)
|
||||
* Description: Declarations of C functions and C owned data.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Apr 27 12:51:28 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TFACEP_H
|
||||
#define TFACEP_H
|
||||
|
||||
#include "host.h"
|
||||
#include "blobs.h"
|
||||
#include "tessarray.h"
|
||||
#include "oldlist.h"
|
||||
#include "permute.h"
|
||||
#include "blobclass.h"
|
||||
#include "stopper.h"
|
||||
#include "associate.h"
|
||||
#include "chop.h"
|
||||
#include "structures.h"
|
||||
|
||||
typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, inT32, LIST);
|
||||
typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *);
|
||||
|
||||
#endif
|
@ -25,19 +25,12 @@
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef __UNIX__
|
||||
#include <assert.h>
|
||||
#endif
|
||||
#include "errcode.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "werd.h"
|
||||
#include "tfacep.h"
|
||||
#include "tfacepp.h"
|
||||
#include "tessvars.h"
|
||||
#include "globals.h"
|
||||
#include "reject.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "blamer.h"
|
||||
#include "errcode.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "werd.h"
|
||||
|
||||
#define MAX_UNDIVIDED_LENGTH 24
|
||||
|
||||
@ -50,21 +43,30 @@
|
||||
* Convert the output back to editor form.
|
||||
**********************************************************************/
|
||||
namespace tesseract {
|
||||
void Tesseract::recog_word(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
ASSERT_HOST(word->chopped_word->blobs != NULL);
|
||||
recog_word_recursive(word, blob_choices);
|
||||
void Tesseract::recog_word(WERD_RES *word) {
|
||||
if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
|
||||
word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
|
||||
if (classify_debug_level) tprintf("No truth for word - skipping\n");
|
||||
word->tess_failed = true;
|
||||
return;
|
||||
}
|
||||
ASSERT_HOST(!word->chopped_word->blobs.empty());
|
||||
recog_word_recursive(word);
|
||||
word->SetupBoxWord();
|
||||
if ((word->best_choice->length() != word->box_word->length()) ||
|
||||
(word->best_choice->length() != blob_choices->length())) {
|
||||
if (word->best_choice->length() != word->box_word->length()) {
|
||||
tprintf("recog_word ASSERT FAIL String:\"%s\"; "
|
||||
"Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||
"Strlen=%d; #Blobs=%d\n",
|
||||
word->best_choice->debug_string().string(),
|
||||
word->best_choice->length(), word->box_word->length(),
|
||||
blob_choices->length());
|
||||
word->best_choice->length(), word->box_word->length());
|
||||
}
|
||||
ASSERT_HOST(word->best_choice->length() == word->box_word->length());
|
||||
ASSERT_HOST(word->best_choice->length() == blob_choices->length());
|
||||
// Check that the ratings matrix size matches the sum of all the
|
||||
// segmentation states.
|
||||
if (!word->StatesAllValid()) {
|
||||
tprintf("Not all words have valid states relative to ratings matrix!!");
|
||||
word->DebugWordChoices(true, NULL);
|
||||
ASSERT_HOST(word->StatesAllValid());
|
||||
}
|
||||
if (tessedit_override_permuter) {
|
||||
/* Override the permuter type if a straight dictionary check disagrees. */
|
||||
uinT8 perm_type = word->best_choice->permuter();
|
||||
@ -105,31 +107,13 @@ void Tesseract::recog_word(WERD_RES *word,
|
||||
* Convert the word to tess form and pass it to the tess segmenter.
|
||||
* Convert the output back to editor form.
|
||||
**********************************************************************/
|
||||
void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
void Tesseract::recog_word_recursive(WERD_RES *word) {
|
||||
int word_length = word->chopped_word->NumBlobs(); // no of blobs
|
||||
if (word_length > MAX_UNDIVIDED_LENGTH) {
|
||||
return split_and_recog_word(word, blob_choices);
|
||||
return split_and_recog_word(word);
|
||||
}
|
||||
int initial_blob_choice_len = blob_choices->length();
|
||||
BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);
|
||||
|
||||
// Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
|
||||
for (int i = 0; i < tess_ratings->length(); ++i) {
|
||||
blob_choices_it.add_to_end(tess_ratings->get(i));
|
||||
}
|
||||
delete tess_ratings;
|
||||
|
||||
cc_recog(word);
|
||||
word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
|
||||
// Pad raw_choice with spaces if needed.
|
||||
if (word->raw_choice->length() < word_length) {
|
||||
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
|
||||
while (word->raw_choice->length() < word_length) {
|
||||
word->raw_choice->append_unichar_id(space_id, 1, 0.0,
|
||||
word->raw_choice->certainty());
|
||||
}
|
||||
}
|
||||
|
||||
// Do sanity checks and minor fixes on best_choice.
|
||||
if (word->best_choice->length() > word_length) {
|
||||
@ -141,21 +125,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
tprintf("Word is at:");
|
||||
word->word->bounding_box().print();
|
||||
}
|
||||
if (blob_choices->length() - initial_blob_choice_len != word_length) {
|
||||
word->best_choice->make_bad(); // force rejection
|
||||
tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
|
||||
blob_choices->length(), word_length);
|
||||
blob_choices_it.set_to_list(blob_choices); // list of lists
|
||||
while (blob_choices->length() - initial_blob_choice_len < word_length) {
|
||||
blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
|
||||
tprintf("recog_word: Added dummy choice list\n");
|
||||
}
|
||||
while (blob_choices->length() - initial_blob_choice_len > word_length) {
|
||||
blob_choices_it.move_to_last(); // should never happen
|
||||
delete blob_choices_it.extract();
|
||||
tprintf("recog_word: Deleted choice list\n");
|
||||
}
|
||||
}
|
||||
if (word->best_choice->length() < word_length) {
|
||||
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
|
||||
while (word->best_choice->length() < word_length) {
|
||||
@ -172,133 +141,134 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
* Split the word into 2 smaller pieces at the largest gap.
|
||||
* Recognize the pieces and stick the results back together.
|
||||
**********************************************************************/
|
||||
|
||||
void Tesseract::split_and_recog_word(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
void Tesseract::split_and_recog_word(WERD_RES *word) {
|
||||
// Find the biggest blob gap in the chopped_word.
|
||||
int bestgap = -MAX_INT32;
|
||||
TPOINT best_split_pt;
|
||||
int split_index = 0;
|
||||
TBLOB* best_end = NULL;
|
||||
TBLOB* prev_blob = NULL;
|
||||
for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
|
||||
blob = blob->next) {
|
||||
if (prev_blob != NULL) {
|
||||
TBOX prev_box = prev_blob->bounding_box();
|
||||
TBOX blob_box = blob->bounding_box();
|
||||
int gap = blob_box.left() - prev_box.right();
|
||||
if (gap > bestgap) {
|
||||
bestgap = gap;
|
||||
best_end = prev_blob;
|
||||
best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
|
||||
best_split_pt.y = (prev_box.top() + prev_box.bottom() +
|
||||
blob_box.top() + blob_box.bottom()) / 4;
|
||||
}
|
||||
for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
|
||||
TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
|
||||
TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
|
||||
int gap = blob_box.left() - prev_box.right();
|
||||
if (gap > bestgap) {
|
||||
bestgap = gap;
|
||||
split_index = b;
|
||||
}
|
||||
prev_blob = blob;
|
||||
}
|
||||
ASSERT_HOST(best_end != NULL);
|
||||
ASSERT_HOST(best_end->next != NULL);
|
||||
ASSERT_HOST(split_index > 0);
|
||||
|
||||
// Make a copy of the word to put the 2nd half in.
|
||||
WERD_RES* word2 = new WERD_RES(*word);
|
||||
// Blow away the copied chopped_word, as we want to work with the blobs
|
||||
// from the input chopped_word so the seam_arrays can be merged.
|
||||
delete word2->chopped_word;
|
||||
word2->chopped_word = new TWERD;
|
||||
word2->chopped_word->blobs = best_end->next;
|
||||
best_end->next = NULL;
|
||||
// Make a new seamarray on both words.
|
||||
free_seam_list(word->seam_array);
|
||||
word->seam_array = start_seam_list(word->chopped_word->blobs);
|
||||
word2->seam_array = start_seam_list(word2->chopped_word->blobs);
|
||||
BlamerBundle *orig_bb = word->blamer_bundle;
|
||||
STRING blamer_debug;
|
||||
// Try to adjust truth information.
|
||||
if (orig_bb != NULL) {
|
||||
// Find truth boxes that correspond to the split in the blobs.
|
||||
int b;
|
||||
int begin2_truth_index = -1;
|
||||
if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
orig_bb->truth_has_char_boxes) {
|
||||
int end1_x = best_end->bounding_box().right();
|
||||
int begin2_x = word2->chopped_word->blobs->bounding_box().left();
|
||||
blamer_debug = "Looking for truth split at";
|
||||
blamer_debug.add_str_int(" end1_x ", end1_x);
|
||||
blamer_debug.add_str_int(" begin2_x ", begin2_x);
|
||||
blamer_debug += "\nnorm_truth_word boxes:\n";
|
||||
if (orig_bb->norm_truth_word.length() > 1) {
|
||||
orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
|
||||
for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {
|
||||
orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
|
||||
if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <
|
||||
orig_bb->norm_box_tolerance) &&
|
||||
(abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
|
||||
orig_bb->norm_box_tolerance)) {
|
||||
begin2_truth_index = b;
|
||||
blamer_debug += "Split found\n";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Populate truth information in word and word2 with the first and second
|
||||
// part of the original truth.
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word2->blamer_bundle = new BlamerBundle();
|
||||
if (begin2_truth_index > 0) {
|
||||
word->blamer_bundle->truth_has_char_boxes = true;
|
||||
word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
|
||||
word2->blamer_bundle->truth_has_char_boxes = true;
|
||||
word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
|
||||
BlamerBundle *curr_bb = word->blamer_bundle;
|
||||
for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
|
||||
if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;
|
||||
curr_bb->norm_truth_word.InsertBox(
|
||||
b, orig_bb->norm_truth_word.BlobBox(b));
|
||||
curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
|
||||
curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
|
||||
}
|
||||
} else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
|
||||
word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
|
||||
} else {
|
||||
blamer_debug += "Truth split not found";
|
||||
blamer_debug += orig_bb->truth_has_char_boxes ?
|
||||
"\n" : " (no truth char boxes)\n";
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
|
||||
NULL, wordrec_debug_blamer);
|
||||
word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
|
||||
NULL, wordrec_debug_blamer);
|
||||
}
|
||||
}
|
||||
WERD_RES *word2 = NULL;
|
||||
BlamerBundle *orig_bb = NULL;
|
||||
split_word(word, split_index, &word2, &orig_bb);
|
||||
|
||||
// Recognize the first part of the word.
|
||||
recog_word_recursive(word, blob_choices);
|
||||
recog_word_recursive(word);
|
||||
// Recognize the second part of the word.
|
||||
recog_word_recursive(word2, blob_choices);
|
||||
recog_word_recursive(word2);
|
||||
|
||||
join_words(word, word2, orig_bb);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* split_word
|
||||
*
|
||||
* Split a given WERD_RES in place into two smaller words for recognition.
|
||||
* split_pt is the index of the first blob to go in the second word.
|
||||
* The underlying word is left alone, only the TWERD (and subsequent data)
|
||||
* are split up. orig_blamer_bundle is set to the original blamer bundle,
|
||||
* and will now be owned by the caller. New blamer bundles are forged for the
|
||||
* two pieces.
|
||||
**********************************************************************/
|
||||
void Tesseract::split_word(WERD_RES *word,
|
||||
int split_pt,
|
||||
WERD_RES **right_piece,
|
||||
BlamerBundle **orig_blamer_bundle) const {
|
||||
ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
|
||||
|
||||
// Save a copy of the blamer bundle so we can try to reconstruct it below.
|
||||
BlamerBundle *orig_bb =
|
||||
word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
|
||||
|
||||
WERD_RES *word2 = new WERD_RES(*word);
|
||||
|
||||
// blow away the copied chopped_word, as we want to work with
|
||||
// the blobs from the input chopped_word so seam_arrays can be merged.
|
||||
TWERD *chopped = word->chopped_word;
|
||||
TWERD *chopped2 = new TWERD;
|
||||
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
|
||||
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
|
||||
chopped2->blobs.push_back(chopped->blobs[i]);
|
||||
}
|
||||
chopped->blobs.truncate(split_pt);
|
||||
word->chopped_word = NULL;
|
||||
delete word2->chopped_word;
|
||||
word2->chopped_word = NULL;
|
||||
|
||||
const UNICHARSET &unicharset = *word->uch_set;
|
||||
word->ClearResults();
|
||||
word2->ClearResults();
|
||||
word->chopped_word = chopped;
|
||||
word2->chopped_word = chopped2;
|
||||
word->SetupBasicsFromChoppedWord(unicharset);
|
||||
word2->SetupBasicsFromChoppedWord(unicharset);
|
||||
|
||||
// Try to adjust the blamer bundle.
|
||||
if (orig_bb != NULL) {
|
||||
// TODO(rays) Looks like a leak to me.
|
||||
// orig_bb should take, rather than copy.
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word2->blamer_bundle = new BlamerBundle();
|
||||
orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
|
||||
word2->chopped_word->blobs[0]->bounding_box().left(),
|
||||
wordrec_debug_blamer,
|
||||
word->blamer_bundle, word2->blamer_bundle);
|
||||
}
|
||||
|
||||
*right_piece = word2;
|
||||
*orig_blamer_bundle = orig_bb;
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* join_words
|
||||
*
|
||||
* The opposite of split_word():
|
||||
* join word2 (including any recognized data / seam array / etc)
|
||||
* onto the right of word and then delete word2.
|
||||
* Also, if orig_bb is provided, stitch it back into word.
|
||||
**********************************************************************/
|
||||
void Tesseract::join_words(WERD_RES *word,
|
||||
WERD_RES *word2,
|
||||
BlamerBundle *orig_bb) const {
|
||||
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
|
||||
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
|
||||
// Tack the word2 outputs onto the end of the word outputs.
|
||||
// New blobs might have appeared on the end of word1.
|
||||
for (best_end = word->chopped_word->blobs; best_end->next != NULL;
|
||||
best_end = best_end->next);
|
||||
best_end->next = word2->chopped_word->blobs;
|
||||
TBLOB* blob;
|
||||
for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
|
||||
blob->next = word2->rebuild_word->blobs;
|
||||
word2->chopped_word->blobs = NULL;
|
||||
word2->rebuild_word->blobs = NULL;
|
||||
// Copy the seams onto the end of the word1 seam_array.
|
||||
word->chopped_word->blobs += word2->chopped_word->blobs;
|
||||
word->rebuild_word->blobs += word2->rebuild_word->blobs;
|
||||
word2->chopped_word->blobs.clear();
|
||||
word2->rebuild_word->blobs.clear();
|
||||
TPOINT split_pt;
|
||||
split_pt.x = (prev_box.right() + blob_box.left()) / 2;
|
||||
split_pt.y = (prev_box.top() + prev_box.bottom() +
|
||||
blob_box.top() + blob_box.bottom()) / 4;
|
||||
// Move the word2 seams onto the end of the word1 seam_array.
|
||||
// Since the seam list is one element short, an empty seam marking the
|
||||
// end of the last blob in the first word is needed first.
|
||||
word->seam_array = add_seam(word->seam_array,
|
||||
new_seam(0.0, best_split_pt, NULL, NULL, NULL));
|
||||
for (int i = 0; i < array_count(word2->seam_array); ++i) {
|
||||
SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
|
||||
array_value(word2->seam_array, i) = NULL;
|
||||
word->seam_array = add_seam(word->seam_array, seam);
|
||||
}
|
||||
word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL));
|
||||
word->seam_array += word2->seam_array;
|
||||
word2->seam_array.truncate(0);
|
||||
// Fix widths and gaps.
|
||||
word->blob_widths += word2->blob_widths;
|
||||
word->blob_gaps += word2->blob_gaps;
|
||||
// Fix the ratings matrix.
|
||||
int rat1 = word->ratings->dimension();
|
||||
int rat2 = word2->ratings->dimension();
|
||||
word->ratings->AttachOnCorner(word2->ratings);
|
||||
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
|
||||
word->best_state += word2->best_state;
|
||||
// Append the word choices.
|
||||
*word->best_choice += *word2->best_choice;
|
||||
*word->raw_choice += *word2->raw_choice;
|
||||
|
||||
// How many alt choices from each should we try to get?
|
||||
@ -306,70 +276,56 @@ void Tesseract::split_and_recog_word(WERD_RES *word,
|
||||
// When do we start throwing away extra alt choices?
|
||||
const int kTooManyAltChoices = 100;
|
||||
|
||||
if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
|
||||
// Construct the cartesian product of the alt choices of word(1) and word2.
|
||||
int num_first_alt_choices = word->alt_choices.size();
|
||||
// Nota Bene: For the main loop here, we leave in place word1-only
|
||||
// alt_choices in
|
||||
// word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1]
|
||||
// These will get fused with the best choices for word2 below.
|
||||
for (int j = 1; j < word2->alt_choices.size() &&
|
||||
(j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);
|
||||
j++) {
|
||||
for (int i = 0; i < num_first_alt_choices &&
|
||||
(i <= kAltsPerPiece ||
|
||||
word->alt_choices.size() < kTooManyAltChoices);
|
||||
i++) {
|
||||
WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);
|
||||
*wc += *word2->alt_choices[j];
|
||||
word->alt_choices.push_back(wc);
|
||||
|
||||
word->alt_states.push_back(GenericVector<int>());
|
||||
GenericVector<int> &alt_state = word->alt_states.back();
|
||||
alt_state += word->alt_states[i];
|
||||
alt_state += word2->alt_states[j];
|
||||
}
|
||||
}
|
||||
// Now that we've filled in as many alternates as we want, paste the best
|
||||
// choice for word2 onto the original word alt_choices.
|
||||
for (int i = 0; i < num_first_alt_choices; i++) {
|
||||
*word->alt_choices[i] += *word2->alt_choices[0];
|
||||
word->alt_states[i] += word2->alt_states[0];
|
||||
// Construct the cartesian product of the best_choices of word(1) and word2.
|
||||
WERD_CHOICE_LIST joined_choices;
|
||||
WERD_CHOICE_IT jc_it(&joined_choices);
|
||||
WERD_CHOICE_IT bc1_it(&word->best_choices);
|
||||
WERD_CHOICE_IT bc2_it(&word2->best_choices);
|
||||
int num_word1_choices = word->best_choices.length();
|
||||
int total_joined_choices = num_word1_choices;
|
||||
// Nota Bene: For the main loop here, we operate only on the 2nd and greater
|
||||
// word2 choices, and put them in the joined_choices list. The 1st word2
|
||||
// choice gets added to the original word1 choices in-place after we have
|
||||
// finished with them.
|
||||
int bc2_index = 1;
|
||||
for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
|
||||
if (total_joined_choices >= kTooManyAltChoices &&
|
||||
bc2_index > kAltsPerPiece)
|
||||
break;
|
||||
int bc1_index = 0;
|
||||
for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
|
||||
++bc1_index, bc1_it.forward()) {
|
||||
if (total_joined_choices >= kTooManyAltChoices &&
|
||||
bc1_index > kAltsPerPiece)
|
||||
break;
|
||||
WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
|
||||
*wc += *bc2_it.data();
|
||||
jc_it.add_after_then_move(wc);
|
||||
++total_joined_choices;
|
||||
}
|
||||
}
|
||||
// Now that we've filled in as many alternates as we want, paste the best
|
||||
// choice for word2 onto the original word alt_choices.
|
||||
bc1_it.move_to_first();
|
||||
bc2_it.move_to_first();
|
||||
for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
|
||||
*bc1_it.data() += *bc2_it.data();
|
||||
}
|
||||
bc1_it.move_to_last();
|
||||
bc1_it.add_list_after(&joined_choices);
|
||||
|
||||
// Restore the pointer to original blamer bundle and combine blamer
|
||||
// information recorded in the splits.
|
||||
if (orig_bb != NULL) {
|
||||
IncorrectResultReason irr = orig_bb->incorrect_result_reason;
|
||||
if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";
|
||||
if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&
|
||||
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
|
||||
blamer_debug += "Blame from part 1: ";
|
||||
blamer_debug += word->blamer_bundle->debug;
|
||||
irr = word->blamer_bundle->incorrect_result_reason;
|
||||
}
|
||||
if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&
|
||||
word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
|
||||
blamer_debug += "Blame from part 2: ";
|
||||
blamer_debug += word2->blamer_bundle->debug;
|
||||
if (irr == IRR_CORRECT) {
|
||||
irr = word2->blamer_bundle->incorrect_result_reason;
|
||||
} else if (irr != word2->blamer_bundle->incorrect_result_reason) {
|
||||
irr = IRR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
|
||||
wordrec_debug_blamer);
|
||||
delete word->blamer_bundle;
|
||||
word->blamer_bundle = orig_bb;
|
||||
word->blamer_bundle->incorrect_result_reason = irr;
|
||||
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
|
||||
wordrec_debug_blamer);
|
||||
}
|
||||
}
|
||||
word->SetupBoxWord();
|
||||
word->reject_map.initialise(word->box_word->length());
|
||||
delete word2;
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -1,41 +0,0 @@
|
||||
/**********************************************************************
|
||||
* File: tfacepp.h (Formerly tface++.h)
|
||||
* Description: C++ side of the C/C++ Tess/Editor interface.
|
||||
* Author: Ray Smith
|
||||
* Created: Thu Apr 23 15:39:23 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TFACEPP_H
|
||||
#define TFACEPP_H
|
||||
|
||||
#include "ratngs.h"
|
||||
#include "blobs.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
void call_tester( //call a tester
|
||||
TBLOB *tessblob, //blob to test
|
||||
BOOL8 correct_blob, //true if good
|
||||
char *text, //source text
|
||||
inT32 count, //chars in text
|
||||
LIST result //output of matcher
|
||||
);
|
||||
void call_train_tester( //call a tester
|
||||
TBLOB *tessblob, //blob to test
|
||||
BOOL8 correct_blob, //true if good
|
||||
char *text, //source text
|
||||
inT32 count, //chars in text
|
||||
LIST result //output of matcher
|
||||
);
|
||||
#endif
|
@ -27,7 +27,7 @@
|
||||
**********************************************************************/
|
||||
|
||||
WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check.
|
||||
TBOX &selection_box,
|
||||
const TBOX &selection_box,
|
||||
BLOCK *&pseudo_block,
|
||||
ROW *&pseudo_row) { // Row of selection.
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
|
@ -23,7 +23,7 @@
|
||||
#include "pageres.h"
|
||||
|
||||
WERD *make_pseudo_word(PAGE_RES* page_res, // blocks to check
|
||||
TBOX &selection_box,
|
||||
const TBOX &selection_box,
|
||||
BLOCK *&pseudo_block,
|
||||
ROW *&pseudo_row);
|
||||
|
||||
|
@ -9,7 +9,7 @@ endif
|
||||
|
||||
include_HEADERS = publictypes.h
|
||||
noinst_HEADERS = \
|
||||
blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
|
||||
blamer.h blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
|
||||
detlinefit.h dppoint.h fontinfo.h genblob.h hpdsizes.h ipoints.h \
|
||||
linlsq.h matrix.h mod128.h normalis.h \
|
||||
ocrblock.h ocrpara.h ocrrow.h otsuthr.h \
|
||||
@ -31,12 +31,12 @@ libtesseract_ccstruct_la_LIBADD = \
|
||||
endif
|
||||
|
||||
libtesseract_ccstruct_la_SOURCES = \
|
||||
blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
|
||||
blamer.cpp blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
|
||||
detlinefit.cpp dppoint.cpp fontinfo.cpp genblob.cpp \
|
||||
linlsq.cpp matrix.cpp mod128.cpp normalis.cpp \
|
||||
ocrblock.cpp ocrpara.cpp ocrrow.cpp otsuthr.cpp \
|
||||
pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \
|
||||
publictypes.cpp \
|
||||
params_training_featdef.cpp publictypes.cpp \
|
||||
quadlsq.cpp quadratc.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \
|
||||
seam.cpp split.cpp statistc.cpp stepblob.cpp \
|
||||
vecfuncs.cpp werd.cpp
|
||||
|
587
ccstruct/blamer.cpp
Normal file
587
ccstruct/blamer.cpp
Normal file
@ -0,0 +1,587 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: blamer.cpp
|
||||
// Description: Module allowing precise error causes to be allocated.
|
||||
// Author: Rike Antonova
|
||||
// Refactored: Ray Smith
|
||||
// Created: Mon Feb 04 14:37:01 PST 2013
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "blamer.h"
|
||||
#include "blobs.h"
|
||||
#include "matrix.h"
|
||||
#include "normalis.h"
|
||||
#include "pageres.h"
|
||||
|
||||
// Names for each value of IncorrectResultReason enum. Keep in sync.
|
||||
const char kBlameCorrect[] = "corr";
|
||||
const char kBlameClassifier[] = "cl";
|
||||
const char kBlameChopper[] = "chop";
|
||||
const char kBlameClassLMTradeoff[] = "cl/LM";
|
||||
const char kBlamePageLayout[] = "pglt";
|
||||
const char kBlameSegsearchHeur[] = "ss_heur";
|
||||
const char kBlameSegsearchPP[] = "ss_pp";
|
||||
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
|
||||
const char kBlameAdaption[] = "adapt";
|
||||
const char kBlameNoTruthSplit[] = "no_tr_spl";
|
||||
const char kBlameNoTruth[] = "no_tr";
|
||||
const char kBlameUnknown[] = "unkn";
|
||||
|
||||
const char * const kIncorrectResultReasonNames[] = {
|
||||
kBlameCorrect,
|
||||
kBlameClassifier,
|
||||
kBlameChopper,
|
||||
kBlameClassLMTradeoff,
|
||||
kBlamePageLayout,
|
||||
kBlameSegsearchHeur,
|
||||
kBlameSegsearchPP,
|
||||
kBlameClassOldLMTradeoff,
|
||||
kBlameAdaption,
|
||||
kBlameNoTruthSplit,
|
||||
kBlameNoTruth,
|
||||
kBlameUnknown
|
||||
};
|
||||
|
||||
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
|
||||
return kIncorrectResultReasonNames[irr];
|
||||
}
|
||||
|
||||
const char *BlamerBundle::IncorrectReason() const {
|
||||
return kIncorrectResultReasonNames[incorrect_result_reason_];
|
||||
}
|
||||
|
||||
// Functions to setup the blamer.
|
||||
// Whole word string, whole word bounding box.
|
||||
void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset,
|
||||
const char* truth_str, const TBOX& word_box) {
|
||||
truth_word_.InsertBox(0, word_box);
|
||||
truth_has_char_boxes_ = false;
|
||||
// Encode the string as UNICHAR_IDs.
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
GenericVector<char> lengths;
|
||||
unicharset.encode_string(truth_str, false, &encoding, &lengths, NULL);
|
||||
int total_length = 0;
|
||||
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
|
||||
STRING uch(truth_str + total_length);
|
||||
uch.truncate_at(lengths[i] - total_length);
|
||||
UNICHAR_ID id = encoding[i];
|
||||
if (id != INVALID_UNICHAR_ID) uch = unicharset.get_normed_unichar(id);
|
||||
truth_text_.push_back(uch);
|
||||
}
|
||||
}
|
||||
|
||||
// Single "character" string, "character" bounding box.
|
||||
// May be called multiple times to indicate the characters in a word.
|
||||
void BlamerBundle::SetSymbolTruth(const UNICHARSET& unicharset,
|
||||
const char* char_str, const TBOX& char_box) {
|
||||
STRING symbol_str(char_str);
|
||||
UNICHAR_ID id = unicharset.unichar_to_id(char_str);
|
||||
if (id != INVALID_UNICHAR_ID) {
|
||||
STRING normed_uch(unicharset.get_normed_unichar(id));
|
||||
if (normed_uch.length() > 0) symbol_str = normed_uch;
|
||||
}
|
||||
int length = truth_word_.length();
|
||||
truth_text_.push_back(symbol_str);
|
||||
truth_word_.InsertBox(length, char_box);
|
||||
if (length == 0)
|
||||
truth_has_char_boxes_ = true;
|
||||
else if (truth_word_.BlobBox(length - 1) == char_box)
|
||||
truth_has_char_boxes_ = false;
|
||||
}
|
||||
|
||||
// Marks that there is something wrong with the truth text, like it contains
|
||||
// reject characters.
|
||||
void BlamerBundle::SetRejectedTruth() {
|
||||
incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
truth_has_char_boxes_ = false;
|
||||
}
|
||||
|
||||
// Returns true if the provided word_choice is correct.
|
||||
bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE* word_choice) const {
|
||||
if (word_choice == NULL) return false;
|
||||
const UNICHARSET* uni_set = word_choice->unicharset();
|
||||
STRING normed_choice_str;
|
||||
for (int i = 0; i < word_choice->length(); ++i) {
|
||||
normed_choice_str +=
|
||||
uni_set->get_normed_unichar(word_choice->unichar_id(i));
|
||||
}
|
||||
STRING truth_str = TruthString();
|
||||
return truth_str == normed_choice_str;
|
||||
}
|
||||
|
||||
void BlamerBundle::FillDebugString(const STRING &msg,
|
||||
const WERD_CHOICE *choice,
|
||||
STRING *debug) {
|
||||
(*debug) += "Truth ";
|
||||
for (int i = 0; i < this->truth_text_.length(); ++i) {
|
||||
(*debug) += this->truth_text_[i];
|
||||
}
|
||||
if (!this->truth_has_char_boxes_) (*debug) += " (no char boxes)";
|
||||
if (choice != NULL) {
|
||||
(*debug) += " Choice ";
|
||||
STRING choice_str;
|
||||
choice->string_and_lengths(&choice_str, NULL);
|
||||
(*debug) += choice_str;
|
||||
}
|
||||
if (msg.length() > 0) {
|
||||
(*debug) += "\n";
|
||||
(*debug) += msg;
|
||||
}
|
||||
(*debug) += "\n";
|
||||
}
|
||||
|
||||
// Sets up the norm_truth_word from truth_word using the given DENORM.
|
||||
void BlamerBundle::SetupNormTruthWord(const DENORM& denorm) {
|
||||
// TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
|
||||
norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
|
||||
TPOINT topleft;
|
||||
TPOINT botright;
|
||||
TPOINT norm_topleft;
|
||||
TPOINT norm_botright;
|
||||
for (int b = 0; b < truth_word_.length(); ++b) {
|
||||
const TBOX &box = truth_word_.BlobBox(b);
|
||||
topleft.x = box.left();
|
||||
topleft.y = box.top();
|
||||
botright.x = box.right();
|
||||
botright.y = box.bottom();
|
||||
denorm.NormTransform(NULL, topleft, &norm_topleft);
|
||||
denorm.NormTransform(NULL, botright, &norm_botright);
|
||||
TBOX norm_box(norm_topleft.x, norm_botright.y,
|
||||
norm_botright.x, norm_topleft.y);
|
||||
norm_truth_word_.InsertBox(b, norm_box);
|
||||
}
|
||||
}
|
||||
|
||||
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
|
||||
// bundles) where the right edge/ of the left-hand word is word1_right,
|
||||
// and the left edge of the right-hand word is word2_left.
|
||||
void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug,
|
||||
BlamerBundle* bundle1,
|
||||
BlamerBundle* bundle2) const {
|
||||
STRING debug_str;
|
||||
// Find truth boxes that correspond to the split in the blobs.
|
||||
int b;
|
||||
int begin2_truth_index = -1;
|
||||
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
truth_has_char_boxes_) {
|
||||
debug_str = "Looking for truth split at";
|
||||
debug_str.add_str_int(" end1_x ", word1_right);
|
||||
debug_str.add_str_int(" begin2_x ", word2_left);
|
||||
debug_str += "\nnorm_truth_word boxes:\n";
|
||||
if (norm_truth_word_.length() > 1) {
|
||||
norm_truth_word_.BlobBox(0).print_to_str(&debug_str);
|
||||
for (b = 1; b < norm_truth_word_.length(); ++b) {
|
||||
norm_truth_word_.BlobBox(b).print_to_str(&debug_str);
|
||||
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) <
|
||||
norm_box_tolerance_) &&
|
||||
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) <
|
||||
norm_box_tolerance_)) {
|
||||
begin2_truth_index = b;
|
||||
debug_str += "Split found";
|
||||
break;
|
||||
}
|
||||
}
|
||||
debug_str += '\n';
|
||||
}
|
||||
}
|
||||
// Populate truth information in word and word2 with the first and second
|
||||
// part of the original truth.
|
||||
if (begin2_truth_index > 0) {
|
||||
bundle1->truth_has_char_boxes_ = true;
|
||||
bundle1->norm_box_tolerance_ = norm_box_tolerance_;
|
||||
bundle2->truth_has_char_boxes_ = true;
|
||||
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
|
||||
BlamerBundle *curr_bb = bundle1;
|
||||
for (b = 0; b < norm_truth_word_.length(); ++b) {
|
||||
if (b == begin2_truth_index) curr_bb = bundle2;
|
||||
curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));
|
||||
curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));
|
||||
curr_bb->truth_text_.push_back(truth_text_[b]);
|
||||
}
|
||||
} else if (incorrect_result_reason_ == IRR_NO_TRUTH) {
|
||||
bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
} else {
|
||||
debug_str += "Truth split not found";
|
||||
debug_str += truth_has_char_boxes_ ?
|
||||
"\n" : " (no truth char boxes)\n";
|
||||
bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
|
||||
bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// "Joins" the blames from bundle1 and bundle2 into *this.
|
||||
void BlamerBundle::JoinBlames(const BlamerBundle& bundle1,
|
||||
const BlamerBundle& bundle2, bool debug) {
|
||||
STRING debug_str;
|
||||
IncorrectResultReason irr = incorrect_result_reason_;
|
||||
if (irr != IRR_NO_TRUTH_SPLIT) debug_str = "";
|
||||
if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&
|
||||
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str += "Blame from part 1: ";
|
||||
debug_str += bundle1.debug_;
|
||||
irr = bundle1.incorrect_result_reason_;
|
||||
}
|
||||
if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&
|
||||
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str += "Blame from part 2: ";
|
||||
debug_str += bundle2.debug_;
|
||||
if (irr == IRR_CORRECT) {
|
||||
irr = bundle2.incorrect_result_reason_;
|
||||
} else if (irr != bundle2.incorrect_result_reason_) {
|
||||
irr = IRR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
incorrect_result_reason_ = irr;
|
||||
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
|
||||
SetBlame(irr, debug_str, NULL, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// If a blob with the same bounding box as one of the truth character
|
||||
// bounding boxes is not classified as the corresponding truth character
|
||||
// blames character classifier for incorrect answer.
|
||||
void BlamerBundle::BlameClassifier(const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
const BLOB_CHOICE_LIST& choices,
|
||||
bool debug) {
|
||||
if (!truth_has_char_boxes_ ||
|
||||
incorrect_result_reason_ != IRR_CORRECT)
|
||||
return; // Nothing to do here.
|
||||
|
||||
for (int b = 0; b < norm_truth_word_.length(); ++b) {
|
||||
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
|
||||
// Note that we are more strict on the bounding box boundaries here
|
||||
// than in other places (chopper, segmentation search), since we do
|
||||
// not have the ability to check the previous and next bounding box.
|
||||
if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_/2)) {
|
||||
bool found = false;
|
||||
bool incorrect_adapted = false;
|
||||
UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
|
||||
const char *truth_str = truth_text_[b].string();
|
||||
// We promise not to modify the list or its contents, using a
|
||||
// const BLOB_CHOICE* below.
|
||||
BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST*>(&choices));
|
||||
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
|
||||
choices_it.forward()) {
|
||||
const BLOB_CHOICE* choice = choices_it.data();
|
||||
if (strcmp(truth_str, unicharset.get_normed_unichar(
|
||||
choice->unichar_id())) == 0) {
|
||||
found = true;
|
||||
break;
|
||||
} else if (choice->IsAdapted()) {
|
||||
incorrect_adapted = true;
|
||||
incorrect_adapted_id = choice->unichar_id();
|
||||
}
|
||||
} // end choices_it for loop
|
||||
if (!found) {
|
||||
STRING debug_str = "unichar ";
|
||||
debug_str += truth_str;
|
||||
debug_str += " not found in classification list";
|
||||
SetBlame(IRR_CLASSIFIER, debug_str, NULL, debug);
|
||||
} else if (incorrect_adapted) {
|
||||
STRING debug_str = "better rating for adapted ";
|
||||
debug_str += unicharset.id_to_unichar(incorrect_adapted_id);
|
||||
debug_str += " than for correct ";
|
||||
debug_str += truth_str;
|
||||
SetBlame(IRR_ADAPTION, debug_str, NULL, debug);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} // end iterating over blamer_bundle->norm_truth_word
|
||||
}
|
||||
|
||||
// Checks whether chops were made at all the character bounding box
|
||||
// boundaries in word->truth_word. If not - blames the chopper for an
|
||||
// incorrect answer.
|
||||
void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) {
|
||||
if (NoTruth() || !truth_has_char_boxes_ ||
|
||||
word->chopped_word->blobs.empty()) {
|
||||
return;
|
||||
}
|
||||
STRING debug_str;
|
||||
bool missing_chop = false;
|
||||
int num_blobs = word->chopped_word->blobs.size();
|
||||
int box_index = 0;
|
||||
int blob_index = 0;
|
||||
inT16 truth_x;
|
||||
while (box_index < truth_word_.length() && blob_index < num_blobs) {
|
||||
truth_x = norm_truth_word_.BlobBox(box_index).right();
|
||||
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
|
||||
if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {
|
||||
++blob_index;
|
||||
continue; // encountered an extra chop, keep looking
|
||||
} else if (curr_blob->bounding_box().right() >
|
||||
truth_x + norm_box_tolerance_) {
|
||||
missing_chop = true;
|
||||
break;
|
||||
} else {
|
||||
++blob_index;
|
||||
}
|
||||
}
|
||||
if (missing_chop || box_index < norm_truth_word_.length()) {
|
||||
STRING debug_str;
|
||||
if (missing_chop) {
|
||||
debug_str.add_str_int("Detected missing chop (tolerance=",
|
||||
norm_box_tolerance_);
|
||||
debug_str += ") at Bounding Box=";
|
||||
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
|
||||
curr_blob->bounding_box().print_to_str(&debug_str);
|
||||
debug_str.add_str_int("\nNo chop for truth at x=", truth_x);
|
||||
} else {
|
||||
debug_str.add_str_int("Missing chops for last ",
|
||||
norm_truth_word_.length() - box_index);
|
||||
debug_str += " truth box(es)";
|
||||
}
|
||||
debug_str += "\nMaximally chopped word boxes:\n";
|
||||
for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
|
||||
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
|
||||
curr_blob->bounding_box().print_to_str(&debug_str);
|
||||
debug_str += '\n';
|
||||
}
|
||||
debug_str += "Truth bounding boxes:\n";
|
||||
for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) {
|
||||
norm_truth_word_.BlobBox(box_index).print_to_str(&debug_str);
|
||||
debug_str += '\n';
|
||||
}
|
||||
SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// Blames the classifier or the language model if, after running only the
|
||||
// chopper, best_choice is incorrect and no blame has been yet set.
|
||||
// Blames the classifier if best_choice is classifier's top choice and is a
|
||||
// dictionary word (i.e. language model could not have helped).
|
||||
// Otherwise, blames the language model (formerly permuter word adjustment).
|
||||
void BlamerBundle::BlameClassifierOrLangModel(
|
||||
const WERD_RES* word,
|
||||
const UNICHARSET& unicharset, bool valid_permuter, bool debug) {
|
||||
if (valid_permuter) {
|
||||
// Find out whether best choice is a top choice.
|
||||
best_choice_is_dict_and_top_choice_ = true;
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
|
||||
ASSERT_HOST(!blob_choice_it.empty());
|
||||
BLOB_CHOICE *first_choice = NULL;
|
||||
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
|
||||
blob_choice_it.forward()) { // find first non-fragment choice
|
||||
if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {
|
||||
first_choice = blob_choice_it.data();
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_HOST(first_choice != NULL);
|
||||
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
|
||||
best_choice_is_dict_and_top_choice_ = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
STRING debug_str;
|
||||
if (best_choice_is_dict_and_top_choice_) {
|
||||
debug_str = "Best choice is: incorrect, top choice, dictionary word";
|
||||
debug_str += " with permuter ";
|
||||
debug_str += word->best_choice->permuter_name();
|
||||
} else {
|
||||
debug_str = "Classifier/Old LM tradeoff is to blame";
|
||||
}
|
||||
SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER
|
||||
: IRR_CLASS_OLD_LM_TRADEOFF,
|
||||
debug_str, word->best_choice, debug);
|
||||
}
|
||||
|
||||
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
|
||||
void BlamerBundle::SetupCorrectSegmentation(const TWERD* word, bool debug) {
|
||||
params_training_bundle_.StartHypothesisList();
|
||||
if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_)
|
||||
return; // Nothing to do here.
|
||||
|
||||
STRING debug_str;
|
||||
debug_str += "Blamer computing correct_segmentation_cols\n";
|
||||
int curr_box_col = 0;
|
||||
int next_box_col = 0;
|
||||
int num_blobs = word->NumBlobs();
|
||||
if (num_blobs == 0) return; // No blobs to play with.
|
||||
int blob_index = 0;
|
||||
inT16 next_box_x = word->blobs[blob_index]->bounding_box().right();
|
||||
for (int truth_idx = 0; blob_index < num_blobs &&
|
||||
truth_idx < norm_truth_word_.length();
|
||||
++blob_index) {
|
||||
++next_box_col;
|
||||
inT16 curr_box_x = next_box_x;
|
||||
if (blob_index + 1 < num_blobs)
|
||||
next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
|
||||
inT16 truth_x = norm_truth_word_.BlobBox(truth_idx).right();
|
||||
debug_str.add_str_int("Box x coord vs. truth: ", curr_box_x);
|
||||
debug_str.add_str_int(" ", truth_x);
|
||||
debug_str += "\n";
|
||||
if (curr_box_x > (truth_x + norm_box_tolerance_)) {
|
||||
break; // failed to find a matching box
|
||||
} else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
|
||||
(blob_index + 1 >= num_blobs || // next box can't be included
|
||||
next_box_x > truth_x + norm_box_tolerance_)) {
|
||||
correct_segmentation_cols_.push_back(curr_box_col);
|
||||
correct_segmentation_rows_.push_back(next_box_col-1);
|
||||
++truth_idx;
|
||||
debug_str.add_str_int("col=", curr_box_col);
|
||||
debug_str.add_str_int(" row=", next_box_col-1);
|
||||
debug_str += "\n";
|
||||
curr_box_col = next_box_col;
|
||||
}
|
||||
}
|
||||
if (blob_index < num_blobs || // trailing blobs
|
||||
correct_segmentation_cols_.length() != norm_truth_word_.length()) {
|
||||
debug_str.add_str_int("Blamer failed to find correct segmentation"
|
||||
" (tolerance=", norm_box_tolerance_);
|
||||
if (blob_index >= num_blobs) debug_str += " blob == NULL";
|
||||
debug_str += ")\n";
|
||||
debug_str.add_str_int(" path length ", correct_segmentation_cols_.length());
|
||||
debug_str.add_str_int(" vs. truth ", norm_truth_word_.length());
|
||||
debug_str += "\n";
|
||||
SetBlame(IRR_UNKNOWN, debug_str, NULL, debug);
|
||||
correct_segmentation_cols_.clear();
|
||||
correct_segmentation_rows_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if a guided segmentation search is needed.
|
||||
bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
|
||||
return incorrect_result_reason_ == IRR_CORRECT &&
|
||||
!segsearch_is_looking_for_blame_ &&
|
||||
truth_has_char_boxes_ &&
|
||||
!ChoiceIsCorrect(best_choice);
|
||||
}
|
||||
|
||||
// Setup ready to guide the segmentation search to the correct segmentation.
|
||||
// The callback pp_cb is used to avoid a cyclic dependency.
|
||||
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
|
||||
// WERD_RES, and the LMPainPoints itself.
|
||||
// pp_cb must be a permanent callback, and should be deleted by the caller.
|
||||
void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice,
|
||||
MATRIX* ratings, UNICHAR_ID wildcard_id,
|
||||
bool debug, STRING *debug_str,
|
||||
TessResultCallback2<bool, int, int>* cb) {
|
||||
segsearch_is_looking_for_blame_ = true;
|
||||
if (debug) {
|
||||
tprintf("segsearch starting to look for blame\n");
|
||||
}
|
||||
// Fill pain points for any unclassifed blob corresponding to the
|
||||
// correct segmentation state.
|
||||
*debug_str += "Correct segmentation:\n";
|
||||
for (int idx = 0; idx < correct_segmentation_cols_.length(); ++idx) {
|
||||
debug_str->add_str_int("col=", correct_segmentation_cols_[idx]);
|
||||
debug_str->add_str_int(" row=", correct_segmentation_rows_[idx]);
|
||||
*debug_str += "\n";
|
||||
if (!ratings->Classified(correct_segmentation_cols_[idx],
|
||||
correct_segmentation_rows_[idx],
|
||||
wildcard_id) &&
|
||||
!cb->Run(correct_segmentation_cols_[idx],
|
||||
correct_segmentation_rows_[idx])) {
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
*debug_str += "\nFailed to insert pain point\n";
|
||||
SetBlame(IRR_SEGSEARCH_HEUR, *debug_str, best_choice, debug);
|
||||
break;
|
||||
}
|
||||
} // end for blamer_bundle->correct_segmentation_cols/rows
|
||||
}
|
||||
// Returns true if the guided segsearch is in progress.
|
||||
bool BlamerBundle::GuidedSegsearchStillGoing() const {
|
||||
return segsearch_is_looking_for_blame_;
|
||||
}
|
||||
|
||||
// The segmentation search has ended. Sets the blame appropriately.
|
||||
void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice,
|
||||
bool debug, STRING *debug_str) {
|
||||
// If we are still looking for blame (i.e. best_choice is incorrect, but a
|
||||
// path representing the correct segmentation could be constructed), we can
|
||||
// blame segmentation search pain point prioritization if the rating of the
|
||||
// path corresponding to the correct segmentation is better than that of
|
||||
// best_choice (i.e. language model would have done the correct thing, but
|
||||
// because of poor pain point prioritization the correct segmentation was
|
||||
// never explored). Otherwise we blame the tradeoff between the language model
|
||||
// and the classifier, since even after exploring the path corresponding to
|
||||
// the correct segmentation incorrect best_choice would have been chosen.
|
||||
// One special case when we blame the classifier instead is when best choice
|
||||
// is incorrect, but it is a dictionary word and it classifier's top choice.
|
||||
if (segsearch_is_looking_for_blame_) {
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
if (best_choice_is_dict_and_top_choice_) {
|
||||
*debug_str = "Best choice is: incorrect, top choice, dictionary word";
|
||||
*debug_str += " with permuter ";
|
||||
*debug_str += best_choice->permuter_name();
|
||||
SetBlame(IRR_CLASSIFIER, *debug_str, best_choice, debug);
|
||||
} else if (best_correctly_segmented_rating_ <
|
||||
best_choice->rating()) {
|
||||
*debug_str += "Correct segmentation state was not explored";
|
||||
SetBlame(IRR_SEGSEARCH_PP, *debug_str, best_choice, debug);
|
||||
} else {
|
||||
if (best_correctly_segmented_rating_ >=
|
||||
WERD_CHOICE::kBadRating) {
|
||||
*debug_str += "Correct segmentation paths were pruned by LM\n";
|
||||
} else {
|
||||
debug_str->add_str_double("Best correct segmentation rating ",
|
||||
best_correctly_segmented_rating_);
|
||||
debug_str->add_str_double(" vs. best choice rating ",
|
||||
best_choice->rating());
|
||||
}
|
||||
SetBlame(IRR_CLASS_LM_TRADEOFF, *debug_str, best_choice, debug);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the bundle is null or still does not indicate the correct result,
|
||||
// fix it and use some backup reason for the blame.
|
||||
void BlamerBundle::LastChanceBlame(bool debug, WERD_RES* word) {
|
||||
if (word->blamer_bundle == NULL) {
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame",
|
||||
word->best_choice, debug);
|
||||
} else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
|
||||
word->best_choice, debug);
|
||||
} else {
|
||||
bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);
|
||||
IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;
|
||||
if (irr == IRR_CORRECT && !correct) {
|
||||
STRING debug_str = "Choice is incorrect after recognition";
|
||||
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice,
|
||||
debug);
|
||||
} else if (irr != IRR_CORRECT && correct) {
|
||||
if (debug) {
|
||||
tprintf("Corrected %s\n", word->blamer_bundle->debug_.string());
|
||||
}
|
||||
word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;
|
||||
word->blamer_bundle->debug_ = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sets the misadaption debug if this word is incorrect, as this word is
|
||||
// being adapted to.
|
||||
void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice,
|
||||
bool debug) {
|
||||
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
!ChoiceIsCorrect(best_choice)) {
|
||||
misadaption_debug_ ="misadapt to word (";
|
||||
misadaption_debug_ += best_choice->permuter_name();
|
||||
misadaption_debug_ += "): ";
|
||||
FillDebugString("", best_choice, &misadaption_debug_);
|
||||
if (debug) {
|
||||
tprintf("%s\n", misadaption_debug_.string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
330
ccstruct/blamer.h
Normal file
330
ccstruct/blamer.h
Normal file
@ -0,0 +1,330 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: blamer.h
|
||||
// Description: Module allowing precise error causes to be allocated.
|
||||
// Author: Rike Antonova
|
||||
// Refactored: Ray Smith
|
||||
// Created: Mon Feb 04 14:37:01 PST 2013
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
|
||||
#define TESSERACT_CCSTRUCT_BLAMER_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "boxword.h"
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
#include "params_training_featdef.h"
|
||||
#include "ratngs.h"
|
||||
#include "strngs.h"
|
||||
#include "tesscallback.h"
|
||||
|
||||
// Tolerance (in normalized coords) for matching blob boxes to truth boxes.
static const inT16 kBlamerBoxTolerance = 5;

// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
  // The text recorded in best choice == truth text.
  IRR_CORRECT,
  // Either the top choice is incorrect but is a dictionary word (the
  // language model is unlikely to correct such errors, so the classifier
  // is to blame), or the correct unichar never made it into the
  // classifier's shortlist at all.
  IRR_CLASSIFIER,
  // The chopper failed to produce one or more of the splits that correspond
  // to the correct character bounding boxes recorded in
  // BlamerBundle::truth_word.
  IRR_CHOPPER,
  // The classifier produced the correct unichar for every blob of the
  // correct segmentation, but either its ratings were too poor for the
  // language model to pull out the correct choice, or the language model
  // was too weak to favor it: a classifier/language-model tradeoff error.
  IRR_CLASS_LM_TRADEOFF,
  // Page layout failed to produce the correct bounding box. Blamed when no
  // truth word had a bounding box similar to this word's, implying the
  // word's box itself was wrong.
  IRR_PAGE_LAYOUT,
  // A SegSearch heuristic prevented one or more blobs of the correct
  // segmentation state from being classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,
  // The correct segmentation state was never explored because of poor
  // SegSearch pain point prioritization: a correctly-segmented path rated
  // better than the best choice, so the language model would have picked
  // the correct choice had the state been explored.
  IRR_SEGSEARCH_PP,
  // Same as IRR_CLASS_LM_TRADEOFF, but for words that only ran through the
  // chopper and thus used the old language model (permuters).
  // TODO(antonova): integrate the new language mode with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,
  // An incorrect adaptive template match scored better than a correct one
  // (pre-trained or adapted): an adaption error.
  IRR_ADAPTION,
  // split_and_recog_word() failed to find a suitable split in truth.
  IRR_NO_TRUTH_SPLIT,
  // Truth is not available for this word (e.g. words in the corrected
  // content file were turned into ~~~~ because no appropriate alignment
  // was found).
  IRR_NO_TRUTH,
  // The text recorded in best choice != truth text, but none of the above
  // reasons apply.
  IRR_UNKNOWN,

  IRR_NUM_REASONS
};
|
||||
|
||||
// Blamer-related information to determine the source of errors.
|
||||
struct BlamerBundle {
|
||||
static const char *IncorrectReasonName(IncorrectResultReason irr);
|
||||
BlamerBundle() : truth_has_char_boxes_(false),
|
||||
incorrect_result_reason_(IRR_CORRECT),
|
||||
lattice_data_(NULL) { ClearResults(); }
|
||||
BlamerBundle(const BlamerBundle &other) {
|
||||
this->CopyTruth(other);
|
||||
this->CopyResults(other);
|
||||
}
|
||||
~BlamerBundle() { delete[] lattice_data_; }
|
||||
|
||||
// Accessors.
|
||||
STRING TruthString() const {
|
||||
STRING truth_str;
|
||||
for (int i = 0; i < truth_text_.length(); ++i)
|
||||
truth_str += truth_text_[i];
|
||||
return truth_str;
|
||||
}
|
||||
IncorrectResultReason incorrect_result_reason() const {
|
||||
return incorrect_result_reason_;
|
||||
}
|
||||
bool NoTruth() const {
|
||||
return incorrect_result_reason_ == IRR_NO_TRUTH ||
|
||||
incorrect_result_reason_ == IRR_PAGE_LAYOUT;
|
||||
}
|
||||
bool HasDebugInfo() const {
|
||||
return debug_.length() > 0 || misadaption_debug_.length() > 0;
|
||||
}
|
||||
const STRING& debug() const {
|
||||
return debug_;
|
||||
}
|
||||
const STRING& misadaption_debug() const {
|
||||
return misadaption_debug_;
|
||||
}
|
||||
void UpdateBestRating(float rating) {
|
||||
if (rating < best_correctly_segmented_rating_)
|
||||
best_correctly_segmented_rating_ = rating;
|
||||
}
|
||||
int correct_segmentation_length() const {
|
||||
return correct_segmentation_cols_.length();
|
||||
}
|
||||
// Returns true if the given ratings matrix col,row position is included
|
||||
// in the correct segmentation path at the given index.
|
||||
bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
|
||||
return correct_segmentation_cols_[index] == coord.col &&
|
||||
correct_segmentation_rows_[index] == coord.row;
|
||||
}
|
||||
void set_best_choice_is_dict_and_top_choice(bool value) {
|
||||
best_choice_is_dict_and_top_choice_ = value;
|
||||
}
|
||||
const char* lattice_data() const {
|
||||
return lattice_data_;
|
||||
}
|
||||
int lattice_size() const {
|
||||
return lattice_size_; // size of lattice_data in bytes
|
||||
}
|
||||
void set_lattice_data(const char* data, int size) {
|
||||
lattice_size_ = size;
|
||||
delete [] lattice_data_;
|
||||
lattice_data_ = new char[lattice_size_];
|
||||
memcpy(lattice_data_, data, lattice_size_);
|
||||
}
|
||||
const tesseract::ParamsTrainingBundle& params_training_bundle() const {
|
||||
return params_training_bundle_;
|
||||
}
|
||||
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
|
||||
void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
|
||||
params_training_bundle_.AddHypothesis(hypo);
|
||||
}
|
||||
|
||||
// Functions to setup the blamer.
|
||||
// Whole word string, whole word bounding box.
|
||||
void SetWordTruth(const UNICHARSET& unicharset,
|
||||
const char* truth_str, const TBOX& word_box);
|
||||
// Single "character" string, "character" bounding box.
|
||||
// May be called multiple times to indicate the characters in a word.
|
||||
void SetSymbolTruth(const UNICHARSET& unicharset,
|
||||
const char* char_str, const TBOX& char_box);
|
||||
// Marks that there is something wrong with the truth text, like it contains
|
||||
// reject characters.
|
||||
void SetRejectedTruth();
|
||||
|
||||
// Returns true if the provided word_choice is correct.
|
||||
bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
|
||||
|
||||
void ClearResults() {
|
||||
norm_truth_word_.DeleteAllBoxes();
|
||||
norm_box_tolerance_ = 0;
|
||||
if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
|
||||
debug_ = "";
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
|
||||
correct_segmentation_cols_.clear();
|
||||
correct_segmentation_rows_.clear();
|
||||
best_choice_is_dict_and_top_choice_ = false;
|
||||
delete[] lattice_data_;
|
||||
lattice_data_ = NULL;
|
||||
lattice_size_ = 0;
|
||||
}
|
||||
void CopyTruth(const BlamerBundle &other) {
|
||||
truth_has_char_boxes_ = other.truth_has_char_boxes_;
|
||||
truth_word_ = other.truth_word_;
|
||||
truth_text_ = other.truth_text_;
|
||||
incorrect_result_reason_ =
|
||||
(other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
|
||||
}
|
||||
void CopyResults(const BlamerBundle &other) {
|
||||
norm_truth_word_ = other.norm_truth_word_;
|
||||
norm_box_tolerance_ = other.norm_box_tolerance_;
|
||||
incorrect_result_reason_ = other.incorrect_result_reason_;
|
||||
segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
|
||||
best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
|
||||
correct_segmentation_cols_ = other.correct_segmentation_cols_;
|
||||
correct_segmentation_rows_ = other.correct_segmentation_rows_;
|
||||
best_choice_is_dict_and_top_choice_ =
|
||||
other.best_choice_is_dict_and_top_choice_;
|
||||
if (other.lattice_data_ != NULL) {
|
||||
lattice_data_ = new char[other.lattice_size_];
|
||||
memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
|
||||
lattice_size_ = other.lattice_size_;
|
||||
} else {
|
||||
lattice_data_ = NULL;
|
||||
}
|
||||
}
|
||||
const char *IncorrectReason() const;
|
||||
|
||||
// Appends choice and truth details to the given debug string.
|
||||
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
|
||||
STRING *debug);
|
||||
|
||||
// Sets up the norm_truth_word from truth_word using the given DENORM.
|
||||
void SetupNormTruthWord(const DENORM& denorm);
|
||||
|
||||
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
|
||||
// bundles) where the right edge/ of the left-hand word is word1_right,
|
||||
// and the left edge of the right-hand word is word2_left.
|
||||
void SplitBundle(int word1_right, int word2_left, bool debug,
|
||||
BlamerBundle* bundle1, BlamerBundle* bundle2) const;
|
||||
// "Joins" the blames from bundle1 and bundle2 into *this.
|
||||
void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
|
||||
bool debug);
|
||||
|
||||
// If a blob with the same bounding box as one of the truth character
|
||||
// bounding boxes is not classified as the corresponding truth character
|
||||
// blames character classifier for incorrect answer.
|
||||
void BlameClassifier(const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
const BLOB_CHOICE_LIST& choices,
|
||||
bool debug);
|
||||
|
||||
|
||||
// Checks whether chops were made at all the character bounding box
|
||||
// boundaries in word->truth_word. If not - blames the chopper for an
|
||||
// incorrect answer.
|
||||
void SetChopperBlame(const WERD_RES* word, bool debug);
|
||||
// Blames the classifier or the language model if, after running only the
|
||||
// chopper, best_choice is incorrect and no blame has been yet set.
|
||||
// Blames the classifier if best_choice is classifier's top choice and is a
|
||||
// dictionary word (i.e. language model could not have helped).
|
||||
// Otherwise, blames the language model (formerly permuter word adjustment).
|
||||
void BlameClassifierOrLangModel(
|
||||
const WERD_RES* word,
|
||||
const UNICHARSET& unicharset, bool valid_permuter, bool debug);
|
||||
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
|
||||
void SetupCorrectSegmentation(const TWERD* word, bool debug);
|
||||
|
||||
// Returns true if a guided segmentation search is needed.
|
||||
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
|
||||
// Setup ready to guide the segmentation search to the correct segmentation.
|
||||
// The callback pp_cb is used to avoid a cyclic dependency.
|
||||
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
|
||||
// WERD_RES, and the LMPainPoints itself.
|
||||
// pp_cb must be a permanent callback, and should be deleted by the caller.
|
||||
void InitForSegSearch(const WERD_CHOICE *best_choice,
|
||||
MATRIX* ratings, UNICHAR_ID wildcard_id,
|
||||
bool debug, STRING *debug_str,
|
||||
TessResultCallback2<bool, int, int>* pp_cb);
|
||||
// Returns true if the guided segsearch is in progress.
|
||||
bool GuidedSegsearchStillGoing() const;
|
||||
// The segmentation search has ended. Sets the blame appropriately.
|
||||
void FinishSegSearch(const WERD_CHOICE *best_choice,
|
||||
bool debug, STRING *debug_str);
|
||||
|
||||
// If the bundle is null or still does not indicate the correct result,
|
||||
// fix it and use some backup reason for the blame.
|
||||
static void LastChanceBlame(bool debug, WERD_RES* word);
|
||||
|
||||
// Sets the misadaption debug if this word is incorrect, as this word is
|
||||
// being adapted to.
|
||||
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
|
||||
|
||||
private:
|
||||
void SetBlame(IncorrectResultReason irr, const STRING &msg,
|
||||
const WERD_CHOICE *choice, bool debug) {
|
||||
incorrect_result_reason_ = irr;
|
||||
debug_ = IncorrectReason();
|
||||
debug_ += " to blame: ";
|
||||
FillDebugString(msg, choice, &debug_);
|
||||
if (debug) tprintf("SetBlame(): %s", debug_.string());
|
||||
}
|
||||
|
||||
private:
|
||||
// Set to true when bounding boxes for individual unichars are recorded.
|
||||
bool truth_has_char_boxes_;
|
||||
// The true_word (in the original image coordinate space) contains ground
|
||||
// truth bounding boxes for this WERD_RES.
|
||||
tesseract::BoxWord truth_word_;
|
||||
// Same as above, but in normalized coordinates
|
||||
// (filled in by WERD_RES::SetupForRecognition()).
|
||||
tesseract::BoxWord norm_truth_word_;
|
||||
// Tolerance for bounding box comparisons in normalized space.
|
||||
int norm_box_tolerance_;
|
||||
// Contains ground truth unichar for each of the bounding boxes in truth_word.
|
||||
GenericVector<STRING> truth_text_;
|
||||
// The reason for incorrect OCR result.
|
||||
IncorrectResultReason incorrect_result_reason_;
|
||||
// Debug text associated with the blame.
|
||||
STRING debug_;
|
||||
// Misadaption debug information (filled in if this word was misadapted to).
|
||||
STRING misadaption_debug_;
|
||||
// Variables used by the segmentation search when looking for the blame.
|
||||
// Set to true while segmentation search is continued after the usual
|
||||
// termination condition in order to look for the blame.
|
||||
bool segsearch_is_looking_for_blame_;
|
||||
// Best rating for correctly segmented path
|
||||
// (set and used by SegSearch when looking for blame).
|
||||
float best_correctly_segmented_rating_;
|
||||
// Vectors populated by SegSearch to indicate column and row indices that
|
||||
// correspond to blobs with correct bounding boxes.
|
||||
GenericVector<int> correct_segmentation_cols_;
|
||||
GenericVector<int> correct_segmentation_rows_;
|
||||
// Set to true if best choice is a dictionary word and
|
||||
// classifier's top choice.
|
||||
bool best_choice_is_dict_and_top_choice_;
|
||||
// Serialized segmentation search lattice.
|
||||
char *lattice_data_;
|
||||
int lattice_size_; // size of lattice_data in bytes
|
||||
// Information about hypotheses (paths) explored by the segmentation search.
|
||||
tesseract::ParamsTrainingBundle params_training_bundle_;
|
||||
};
|
||||
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_BLAMER_H_
|
@ -29,12 +29,6 @@ namespace tesseract {
|
||||
// tolerance. Otherwise, the blob may be chopped and we have to just use
|
||||
// the word bounding box.
|
||||
const int kBoxClipTolerance = 2;
|
||||
// Min offset in baseline-normalized coords to make a character a subscript.
|
||||
const int kMinSubscriptOffset = 20;
|
||||
// Min offset in baseline-normalized coords to make a character a superscript.
|
||||
const int kMinSuperscriptOffset = 20;
|
||||
// Max y of bottom of a drop-cap blob.
|
||||
const int kMaxDropCapBottom = -128;
|
||||
|
||||
BoxWord::BoxWord() : length_(0) {
|
||||
}
|
||||
@ -60,21 +54,17 @@ void BoxWord::CopyFrom(const BoxWord& src) {
|
||||
boxes_.push_back(src.boxes_[i]);
|
||||
}
|
||||
|
||||
// Factory to build a BoxWord from a TWERD and the DENORM to switch
|
||||
// back to original image coordinates.
|
||||
// If the denorm is not NULL, then the output is denormalized and rotated
|
||||
// back to the original image coordinates.
|
||||
BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
|
||||
TWERD* tessword) {
|
||||
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
|
||||
// switch back to original image coordinates.
|
||||
BoxWord* BoxWord::CopyFromNormalized(TWERD* tessword) {
|
||||
BoxWord* boxword = new BoxWord();
|
||||
// Count the blobs.
|
||||
boxword->length_ = 0;
|
||||
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next)
|
||||
++boxword->length_;
|
||||
boxword->length_ = tessword->NumBlobs();
|
||||
// Allocate memory.
|
||||
boxword->boxes_.reserve(boxword->length_);
|
||||
|
||||
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next) {
|
||||
for (int b = 0; b < boxword->length_; ++b) {
|
||||
TBLOB* tblob = tessword->blobs[b];
|
||||
TBOX blob_box;
|
||||
for (TESSLINE* outline = tblob->outlines; outline != NULL;
|
||||
outline = outline->next) {
|
||||
@ -83,12 +73,10 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
|
||||
do {
|
||||
if (!edgept->IsHidden() || !edgept->prev->IsHidden()) {
|
||||
ICOORD pos(edgept->pos.x, edgept->pos.y);
|
||||
if (denorm != NULL) {
|
||||
TPOINT denormed;
|
||||
denorm->DenormTransform(edgept->pos, &denormed);
|
||||
pos.set_x(denormed.x);
|
||||
pos.set_y(denormed.y);
|
||||
}
|
||||
TPOINT denormed;
|
||||
tblob->denorm().DenormTransform(NULL, edgept->pos, &denormed);
|
||||
pos.set_x(denormed.x);
|
||||
pos.set_y(denormed.y);
|
||||
TBOX pt_box(pos, pos);
|
||||
blob_box += pt_box;
|
||||
}
|
||||
@ -101,37 +89,6 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
|
||||
return boxword;
|
||||
}
|
||||
|
||||
// Sets up the script_pos_ member using the tessword to get the bln
|
||||
// bounding boxes, the best_choice to get the unichars, and the unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
void BoxWord::SetScriptPositions(const UNICHARSET& unicharset, bool small_caps,
|
||||
TWERD* tessword, WERD_CHOICE* best_choice) {
|
||||
// Allocate memory.
|
||||
script_pos_.init_to_size(length_, SP_NORMAL);
|
||||
|
||||
int blob_index = 0;
|
||||
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next,
|
||||
++blob_index) {
|
||||
int class_id = best_choice->unichar_id(blob_index);
|
||||
TBOX blob_box = tblob->bounding_box();
|
||||
int top = blob_box.top();
|
||||
int bottom = blob_box.bottom();
|
||||
int min_bottom, max_bottom, min_top, max_top;
|
||||
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
|
||||
&min_top, &max_top);
|
||||
if (bottom <= kMaxDropCapBottom) {
|
||||
script_pos_[blob_index] = SP_DROPCAP;
|
||||
} else if (!small_caps) {
|
||||
if (top + kMinSubscriptOffset < min_top) {
|
||||
script_pos_[blob_index] = SP_SUBSCRIPT;
|
||||
} else if (bottom - kMinSuperscriptOffset > max_bottom) {
|
||||
script_pos_[blob_index] = SP_SUPERSCRIPT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up the bounding boxes from the polygonal approximation by
|
||||
// expanding slightly, then clipping to the blobs from the original_word
|
||||
// that overlap. If not null, the block provides the inverse rotation.
|
||||
@ -228,9 +185,8 @@ void BoxWord::ComputeBoundingBox() {
|
||||
// The callback is deleted on completion.
|
||||
void BoxWord::ProcessMatchedBlobs(const TWERD& other,
|
||||
TessCallback1<int>* cb) const {
|
||||
TBLOB* blob = other.blobs;
|
||||
for (int i = 0; i < length_ && blob != NULL; ++i, blob = blob->next) {
|
||||
TBOX blob_box = blob->bounding_box();
|
||||
for (int i = 0; i < length_ && i < other.NumBlobs(); ++i) {
|
||||
TBOX blob_box = other.blobs[i]->bounding_box();
|
||||
if (blob_box == boxes_[i])
|
||||
cb->Run(i);
|
||||
}
|
||||
@ -238,5 +194,3 @@ void BoxWord::ProcessMatchedBlobs(const TWERD& other,
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "genericvector.h"
|
||||
#include "rect.h"
|
||||
#include "unichar.h"
|
||||
|
||||
class BLOCK;
|
||||
class DENORM;
|
||||
@ -34,14 +35,6 @@ class WERD_RES;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// ScriptPos tells whether a character is subscript, superscript or normal.
|
||||
enum ScriptPos {
|
||||
SP_NORMAL,
|
||||
SP_SUBSCRIPT,
|
||||
SP_SUPERSCRIPT,
|
||||
SP_DROPCAP
|
||||
};
|
||||
|
||||
// Class to hold an array of bounding boxes for an output word and
|
||||
// the bounding box of the whole word.
|
||||
class BoxWord {
|
||||
@ -54,19 +47,9 @@ class BoxWord {
|
||||
|
||||
void CopyFrom(const BoxWord& src);
|
||||
|
||||
// Factory to build a BoxWord from a TWERD and the DENORM to switch
|
||||
// back to original image coordinates.
|
||||
// If the denorm is not NULL, then the output is denormalized and rotated
|
||||
// back to the original image coordinates.
|
||||
static BoxWord* CopyFromNormalized(const DENORM* denorm,
|
||||
TWERD* tessword);
|
||||
|
||||
// Sets up the script_pos_ member using the tessword to get the bln
|
||||
// bounding boxes, the best_choice to get the unichars, and the unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
void SetScriptPositions(const UNICHARSET& unicharset, bool small_caps,
|
||||
TWERD* tessword, WERD_CHOICE* best_choice);
|
||||
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
|
||||
// switch back to original image coordinates.
|
||||
static BoxWord* CopyFromNormalized(TWERD* tessword);
|
||||
|
||||
// Clean up the bounding boxes from the polygonal approximation by
|
||||
// expanding slightly, then clipping to the blobs from the original_word
|
||||
@ -102,11 +85,6 @@ class BoxWord {
|
||||
const TBOX& BlobBox(int index) const {
|
||||
return boxes_[index];
|
||||
}
|
||||
ScriptPos BlobPosition(int index) const {
|
||||
if (index < 0 || index >= script_pos_.size())
|
||||
return SP_NORMAL;
|
||||
return script_pos_[index];
|
||||
}
|
||||
|
||||
private:
|
||||
void ComputeBoundingBox();
|
||||
@ -114,7 +92,6 @@ class BoxWord {
|
||||
TBOX bbox_;
|
||||
int length_;
|
||||
GenericVector<TBOX> boxes_;
|
||||
GenericVector<ScriptPos> script_pos_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
@ -32,21 +32,120 @@
|
||||
#include "tprintf.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
// Returns true if there are any real classification results.
|
||||
bool MATRIX::Classified(int col, int row, int wildcard_id) const {
|
||||
if (get(col, row) == NOT_CLASSIFIED) return false;
|
||||
BLOB_CHOICE_IT b_it(get(col, row));
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
BLOB_CHOICE* choice = b_it.data();
|
||||
if (choice->IsClassified())
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Expands the existing matrix in-place to make the band wider, without
|
||||
// losing any existing data.
|
||||
void MATRIX::IncreaseBandSize(int bandwidth) {
|
||||
ResizeWithCopy(dimension(), bandwidth);
|
||||
}
|
||||
|
||||
// Returns a bigger MATRIX with a new column and row in the matrix in order
|
||||
// to split the blob at the given (ind,ind) diagonal location.
|
||||
// Entries are relocated to the new MATRIX using the transformation defined
|
||||
// by MATRIX_COORD::MapForSplit.
|
||||
// Transfers the pointer data to the new MATRIX and deletes *this.
|
||||
MATRIX* MATRIX::ConsumeAndMakeBigger(int ind) {
|
||||
int dim = dimension();
|
||||
int band_width = bandwidth();
|
||||
// Check to see if bandwidth needs expanding.
|
||||
for (int col = ind; col >= 0 && col > ind - band_width; --col) {
|
||||
if (array_[col * band_width + band_width - 1] != empty_) {
|
||||
++band_width;
|
||||
break;
|
||||
}
|
||||
}
|
||||
MATRIX* result = new MATRIX(dim + 1, band_width);
|
||||
|
||||
for (int col = 0; col < dim; ++col) {
|
||||
for (int row = col; row < dim && row < col + bandwidth(); ++row) {
|
||||
MATRIX_COORD coord(col, row);
|
||||
coord.MapForSplit(ind);
|
||||
BLOB_CHOICE_LIST* choices = get(col, row);
|
||||
if (choices != NULL) {
|
||||
// Correct matrix location on each choice.
|
||||
BLOB_CHOICE_IT bc_it(choices);
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
BLOB_CHOICE* choice = bc_it.data();
|
||||
choice->set_matrix_cell(coord.col, coord.row);
|
||||
}
|
||||
ASSERT_HOST(coord.Valid(*result));
|
||||
result->put(coord.col, coord.row, choices);
|
||||
}
|
||||
}
|
||||
}
|
||||
delete this;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs
|
||||
// on the lists, but not any LanguageModelState that may be attached to the
|
||||
// BLOB_CHOICEs.
|
||||
MATRIX* MATRIX::DeepCopy() const {
|
||||
int dim = dimension();
|
||||
int band_width = bandwidth();
|
||||
MATRIX* result = new MATRIX(dim, band_width);
|
||||
for (int col = 0; col < dim; ++col) {
|
||||
for (int row = col; row < col + band_width; ++row) {
|
||||
BLOB_CHOICE_LIST* choices = get(col, row);
|
||||
if (choices != NULL) {
|
||||
BLOB_CHOICE_LIST* copy_choices = new BLOB_CHOICE_LIST;
|
||||
choices->deep_copy(copy_choices, &BLOB_CHOICE::deep_copy);
|
||||
result->put(col, row, copy_choices);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Print the best guesses out of the match rating matrix.
|
||||
void MATRIX::print(const UNICHARSET &unicharset) const {
|
||||
tprintf("Ratings Matrix (top choices)\n");
|
||||
tprintf("Ratings Matrix (top 3 choices)\n");
|
||||
int dim = dimension();
|
||||
int band_width = bandwidth();
|
||||
int row, col;
|
||||
for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col);
|
||||
for (col = 0; col < dim; ++col) {
|
||||
for (row = col; row < dim && row < col + band_width; ++row) {
|
||||
BLOB_CHOICE_LIST *rating = this->get(col, row);
|
||||
if (rating == NOT_CLASSIFIED) continue;
|
||||
BLOB_CHOICE_IT b_it(rating);
|
||||
tprintf("col=%d row=%d ", col, row);
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
tprintf("%s rat=%g cert=%g " ,
|
||||
unicharset.id_to_unichar(b_it.data()->unichar_id()),
|
||||
b_it.data()->rating(), b_it.data()->certainty());
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
tprintf("\n");
|
||||
for (row = 0; row < this->dimension(); ++row) {
|
||||
for (col = 0; col < dim; ++col) tprintf("\t%d", col);
|
||||
tprintf("\n");
|
||||
for (row = 0; row < dim; ++row) {
|
||||
for (col = 0; col <= row; ++col) {
|
||||
if (col == 0) tprintf("%d\t", row);
|
||||
if (row >= col + band_width) {
|
||||
tprintf(" \t");
|
||||
continue;
|
||||
}
|
||||
BLOB_CHOICE_LIST *rating = this->get(col, row);
|
||||
if (rating != NOT_CLASSIFIED) {
|
||||
BLOB_CHOICE_IT b_it(rating);
|
||||
int counter = 0;
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id()));
|
||||
tprintf("%s ",
|
||||
unicharset.id_to_unichar(b_it.data()->unichar_id()));
|
||||
++counter;
|
||||
if (counter == 3) break;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
******************************************************************************
|
||||
*
|
||||
* File: matrix.h (Formerly matrix.h)
|
||||
* Description: Ratings matrix code. (Used by associator)
|
||||
@ -25,18 +25,28 @@
|
||||
#ifndef TESSERACT_CCSTRUCT_MATRIX_H__
|
||||
#define TESSERACT_CCSTRUCT_MATRIX_H__
|
||||
|
||||
#include "ratngs.h"
|
||||
#include "kdpair.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
class BLOB_CHOICE_LIST;
|
||||
|
||||
#define NOT_CLASSIFIED reinterpret_cast<BLOB_CHOICE_LIST*>(NULL)
|
||||
|
||||
// A generic class to store a matrix with entries of type T.
|
||||
// A generic class to hold a 2-D matrix with entries of type T, but can also
|
||||
// act as a base class for other implementations, such as a triangular or
|
||||
// banded matrix.
|
||||
template <class T>
|
||||
class GENERIC_2D_ARRAY {
|
||||
public:
|
||||
// Allocate a piece of memory to hold a 2d-array of the given dimension.
|
||||
// Initialize all the elements of the array to empty instead of assuming
|
||||
// that a default constructor can be used.
|
||||
// Initializes the array size, and empty element, but cannot allocate memory
|
||||
// for the subclasses or initialize because calls to the num_elements
|
||||
// member will be routed to the base class implementation. Subclasses can
|
||||
// either pass the memory in, or allocate after by calling Resize().
|
||||
GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty, T* array)
|
||||
: empty_(empty), dim1_(dim1), dim2_(dim2), array_(array) {
|
||||
}
|
||||
// Original constructor for a full rectangular matrix DOES allocate memory
|
||||
// and initialize it to empty.
|
||||
GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty)
|
||||
: empty_(empty), dim1_(dim1), dim2_(dim2) {
|
||||
array_ = new T[dim1_ * dim2_];
|
||||
@ -44,26 +54,67 @@ class GENERIC_2D_ARRAY {
|
||||
for (int y = 0; y < dim2_; y++)
|
||||
this->put(x, y, empty_);
|
||||
}
|
||||
~GENERIC_2D_ARRAY() { delete[] array_; }
|
||||
virtual ~GENERIC_2D_ARRAY() { delete[] array_; }
|
||||
|
||||
// Reallocate the array to the given size. Does not keep old data.
|
||||
void Resize(int size1, int size2, const T& empty) {
|
||||
empty_ = empty;
|
||||
if (size1 != dim1_ || size2 != dim2_) {
|
||||
dim1_ = size1;
|
||||
dim2_ = size2;
|
||||
delete [] array_;
|
||||
array_ = new T[dim1_ * dim2_];
|
||||
}
|
||||
Clear();
|
||||
}
|
||||
|
||||
// Reallocate the array to the given size, keeping old data.
|
||||
void ResizeWithCopy(int size1, int size2) {
|
||||
if (size1 != dim1_ || size2 != dim2_) {
|
||||
T* new_array = new T[size1 * size2];
|
||||
for (int col = 0; col < size1; ++col) {
|
||||
for (int row = 0; row < size2; ++row) {
|
||||
int old_index = col * dim2() + row;
|
||||
int new_index = col * size2 + row;
|
||||
if (col < dim1_ && row < dim2_) {
|
||||
new_array[new_index] = array_[old_index];
|
||||
} else {
|
||||
new_array[new_index] = empty_;
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] array_;
|
||||
array_ = new_array;
|
||||
dim1_ = size1;
|
||||
dim2_ = size2;
|
||||
}
|
||||
}
|
||||
|
||||
// Sets all the elements of the array to the empty value.
|
||||
void Clear() {
|
||||
int total_size = num_elements();
|
||||
for (int i = 0; i < total_size; ++i)
|
||||
array_[i] = empty_;
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
// Only works with bitwise-serializeable types!
|
||||
bool Serialize(FILE* fp) const {
|
||||
if (!SerializeSize(fp)) return false;
|
||||
if (fwrite(&empty_, sizeof(empty_), 1, fp) != 1) return false;
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
if (fwrite(array_, sizeof(*array_), size, fp) != size) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// Only works with bitwise-serializeable types!
|
||||
// Only works with bitwise-serializeable typ
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp) {
|
||||
if (!DeSerializeSize(swap, fp)) return false;
|
||||
if (fread(&empty_, sizeof(empty_), 1, fp) != 1) return false;
|
||||
if (swap) ReverseN(&empty_, sizeof(empty_));
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
if (fread(array_, sizeof(*array_), size, fp) != size) return false;
|
||||
if (swap) {
|
||||
for (int i = 0; i < size; ++i)
|
||||
@ -77,7 +128,7 @@ class GENERIC_2D_ARRAY {
|
||||
bool SerializeClasses(FILE* fp) const {
|
||||
if (!SerializeSize(fp)) return false;
|
||||
if (!empty_.Serialize(fp)) return false;
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (!array_[i].Serialize(fp)) return false;
|
||||
}
|
||||
@ -90,7 +141,7 @@ class GENERIC_2D_ARRAY {
|
||||
bool DeSerializeClasses(bool swap, FILE* fp) {
|
||||
if (!DeSerializeSize(swap, fp)) return false;
|
||||
if (!empty_.DeSerialize(swap, fp)) return false;
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (!array_[i].DeSerialize(swap, fp)) return false;
|
||||
}
|
||||
@ -100,11 +151,14 @@ class GENERIC_2D_ARRAY {
|
||||
// Provide the dimensions of this rectangular matrix.
|
||||
int dim1() const { return dim1_; }
|
||||
int dim2() const { return dim2_; }
|
||||
// Returns the number of elements in the array.
|
||||
// Banded/triangular matrices may override.
|
||||
virtual int num_elements() const { return dim1_ * dim2_; }
|
||||
|
||||
// Expression to select a specific location in the matrix. The matrix is
|
||||
// stored COLUMN-major, so the left-most index is the most significant.
|
||||
// This allows [][] access to use indices in the same order as (,).
|
||||
int index(int column, int row) const {
|
||||
virtual int index(int column, int row) const {
|
||||
return (column * dim2_ + row);
|
||||
}
|
||||
|
||||
@ -129,19 +183,21 @@ class GENERIC_2D_ARRAY {
|
||||
T* operator[](int column) {
|
||||
return &array_[this->index(column, 0)];
|
||||
}
|
||||
const T* operator[](int column) const {
|
||||
return &array_[this->index(column, 0)];
|
||||
}
|
||||
|
||||
// Delete objects pointed to by array_[i].
|
||||
void delete_matrix_pointers() {
|
||||
for (int x = 0; x < dim1_; x++) {
|
||||
for (int y = 0; y < dim2_; y++) {
|
||||
T matrix_cell = this->get(x, y);
|
||||
if (matrix_cell != empty_)
|
||||
delete matrix_cell;
|
||||
}
|
||||
int size = num_elements();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
T matrix_cell = array_[i];
|
||||
if (matrix_cell != empty_)
|
||||
delete matrix_cell;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
protected:
|
||||
// Factored helper to serialize the size.
|
||||
bool SerializeSize(FILE* fp) const {
|
||||
inT32 size = dim1_;
|
||||
@ -160,12 +216,7 @@ class GENERIC_2D_ARRAY {
|
||||
ReverseN(&size1, sizeof(size1));
|
||||
ReverseN(&size2, sizeof(size2));
|
||||
}
|
||||
if (size1 != dim1_ || size2 != dim2_) {
|
||||
dim1_ = size1;
|
||||
dim2_ = size2;
|
||||
delete [] array_;
|
||||
array_ = new T[dim1_ * dim2_];
|
||||
}
|
||||
Resize(size1, size2, empty_);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -175,25 +226,90 @@ class GENERIC_2D_ARRAY {
|
||||
int dim2_; // Size of the 2nd dimension in indexing functions.
|
||||
};
|
||||
|
||||
// A generic class to store a square matrix with entries of type T.
|
||||
// A generic class to store a banded triangular matrix with entries of type T.
|
||||
// In this array, the nominally square matrix is dim1_ x dim1_, and dim2_ is
|
||||
// the number of bands, INCLUDING the diagonal. The storage is thus of size
|
||||
// dim1_ * dim2_ and index(col, row) = col * dim2_ + row - col, and an
|
||||
// assert will fail if row < col or row - col >= dim2.
|
||||
template <class T>
|
||||
class GENERIC_MATRIX : public GENERIC_2D_ARRAY<T> {
|
||||
class BandTriMatrix : public GENERIC_2D_ARRAY<T> {
|
||||
public:
|
||||
// Allocate a piece of memory to hold a matrix of the given dimension.
|
||||
// Initialize all the elements of the matrix to empty instead of assuming
|
||||
// Allocate a piece of memory to hold a 2d-array of the given dimension.
|
||||
// Initialize all the elements of the array to empty instead of assuming
|
||||
// that a default constructor can be used.
|
||||
GENERIC_MATRIX(int dimension, const T& empty)
|
||||
: GENERIC_2D_ARRAY<T>(dimension, dimension, empty) {
|
||||
BandTriMatrix(int dim1, int dim2, const T& empty)
|
||||
: GENERIC_2D_ARRAY<T>(dim1, dim2, empty) {
|
||||
}
|
||||
// The default destructor will do.
|
||||
|
||||
// Provide the dimensions of this matrix.
|
||||
// dimension is the size of the nominally square matrix.
|
||||
int dimension() const { return this->dim1_; }
|
||||
// bandwidth is the number of bands in the matrix, INCLUDING the diagonal.
|
||||
int bandwidth() const { return this->dim2_; }
|
||||
|
||||
// Expression to select a specific location in the matrix. The matrix is
|
||||
// stored COLUMN-major, so the left-most index is the most significant.
|
||||
// This allows [][] access to use indices in the same order as (,).
|
||||
virtual int index(int column, int row) const {
|
||||
ASSERT_HOST(row >= column);
|
||||
ASSERT_HOST(row - column < this->dim2_);
|
||||
return column * this->dim2_ + row - column;
|
||||
}
|
||||
|
||||
// Provide the dimension of this square matrix.
|
||||
int dimension() const { return this->dim1(); }
|
||||
// Appends array2 corner-to-corner to *this, making an array of dimension
|
||||
// equal to the sum of the individual dimensions.
|
||||
// array2 is not destroyed, but is left empty, as all elements are moved
|
||||
// to *this.
|
||||
void AttachOnCorner(BandTriMatrix<T>* array2) {
|
||||
int new_dim1 = this->dim1_ + array2->dim1_;
|
||||
int new_dim2 = MAX(this->dim2_, array2->dim2_);
|
||||
T* new_array = new T[new_dim1 * new_dim2];
|
||||
for (int col = 0; col < new_dim1; ++col) {
|
||||
for (int j = 0; j < new_dim2; ++j) {
|
||||
int new_index = col * new_dim2 + j;
|
||||
if (col < this->dim1_ && j < this->dim2_) {
|
||||
new_array[new_index] = this->get(col, col + j);
|
||||
} else if (col >= this->dim1_ && j < array2->dim2_) {
|
||||
new_array[new_index] = array2->get(col - this->dim1_,
|
||||
col - this->dim1_ + j);
|
||||
array2->put(col - this->dim1_, col - this->dim1_ + j, NULL);
|
||||
} else {
|
||||
new_array[new_index] = this->empty_;
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] this->array_;
|
||||
this->array_ = new_array;
|
||||
this->dim1_ = new_dim1;
|
||||
this->dim2_ = new_dim2;
|
||||
}
|
||||
};
|
||||
|
||||
class MATRIX : public GENERIC_MATRIX<BLOB_CHOICE_LIST *> {
|
||||
class MATRIX : public BandTriMatrix<BLOB_CHOICE_LIST *> {
|
||||
public:
|
||||
MATRIX(int dimension) : GENERIC_MATRIX<BLOB_CHOICE_LIST *>(dimension,
|
||||
NOT_CLASSIFIED) {}
|
||||
MATRIX(int dimension, int bandwidth)
|
||||
: BandTriMatrix<BLOB_CHOICE_LIST *>(dimension, bandwidth, NOT_CLASSIFIED) {}
|
||||
|
||||
// Returns true if there are any real classification results.
|
||||
bool Classified(int col, int row, int wildcard_id) const;
|
||||
|
||||
// Expands the existing matrix in-place to make the band wider, without
|
||||
// losing any existing data.
|
||||
void IncreaseBandSize(int bandwidth);
|
||||
|
||||
// Returns a bigger MATRIX with a new column and row in the matrix in order
|
||||
// to split the blob at the given (ind,ind) diagonal location.
|
||||
// Entries are relocated to the new MATRIX using the transformation defined
|
||||
// by MATRIX_COORD::MapForSplit.
|
||||
// Transfers the pointer data to the new MATRIX and deletes *this.
|
||||
MATRIX* ConsumeAndMakeBigger(int ind);
|
||||
|
||||
// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs
|
||||
// on the lists, but not any LanguageModelState that may be attached to the
|
||||
// BLOB_CHOICEs.
|
||||
MATRIX* DeepCopy() const;
|
||||
|
||||
// Print a shortened version of the contents of the matrix.
|
||||
void print(const UNICHARSET &unicharset) const;
|
||||
};
|
||||
@ -203,14 +319,34 @@ struct MATRIX_COORD {
|
||||
MATRIX_COORD *c = static_cast<MATRIX_COORD *>(arg);
|
||||
delete c;
|
||||
}
|
||||
// Default constructor required by GenericHeap.
|
||||
MATRIX_COORD() : col(0), row(0) {}
|
||||
MATRIX_COORD(int c, int r): col(c), row(r) {}
|
||||
~MATRIX_COORD() {}
|
||||
|
||||
bool Valid(const MATRIX &m) const {
|
||||
return (col >= 0 && row >= 0 &&
|
||||
col < m.dimension() && row < m.dimension());
|
||||
return 0 <= col && col < m.dimension() &&
|
||||
col <= row && row < col + m.bandwidth() && row < m.dimension();
|
||||
}
|
||||
|
||||
// Remaps the col,row pair to split the blob at the given (ind,ind) diagonal
|
||||
// location.
|
||||
// Entries at (i,j) for i in [0,ind] and j in [ind,dim) move to (i,j+1),
|
||||
// making a new row at ind.
|
||||
// Entries at (i,j) for i in [ind+1,dim) and j in [i,dim) move to (i+i,j+1),
|
||||
// making a new column at ind+1.
|
||||
void MapForSplit(int ind) {
|
||||
ASSERT_HOST(row >= col);
|
||||
if (col > ind) ++col;
|
||||
if (row >= ind) ++row;
|
||||
ASSERT_HOST(row >= col);
|
||||
}
|
||||
|
||||
int col;
|
||||
int row;
|
||||
};
|
||||
|
||||
// The MatrixCoordPair contains a MATRIX_COORD and its priority.
|
||||
typedef tesseract::KDPairInc<float, MATRIX_COORD> MatrixCoordPair;
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_MATRIX_H__
|
||||
|
@ -472,6 +472,8 @@ void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list,
|
||||
BLOCK_IT block_it(block_list);
|
||||
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
|
||||
BLOCK* block = block_it.data();
|
||||
if (block->poly_block() != NULL && !block->poly_block()->IsText())
|
||||
continue; // Don't touch non-text blocks.
|
||||
// Iterate over all rows in the block.
|
||||
ROW_IT row_it(block->row_list());
|
||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -19,6 +19,7 @@
|
||||
#ifndef PAGERES_H
|
||||
#define PAGERES_H
|
||||
|
||||
#include "blamer.h"
|
||||
#include "blobs.h"
|
||||
#include "boxword.h"
|
||||
#include "elst.h"
|
||||
@ -38,167 +39,6 @@ class Tesseract;
|
||||
}
|
||||
using tesseract::FontInfo;
|
||||
|
||||
static const inT16 kBlamerBoxTolerance = 5;
|
||||
|
||||
// Enum for expressing the source of error.
|
||||
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
|
||||
enum IncorrectResultReason {
|
||||
// The text recorded in best choice == truth text
|
||||
IRR_CORRECT,
|
||||
// Either: Top choice is incorrect and is a dictionary word (language model
|
||||
// is unlikely to help correct such errors, so blame the classifier).
|
||||
// Or: the correct unichar was not included in shortlist produced by the
|
||||
// classifier at all.
|
||||
IRR_CLASSIFIER,
|
||||
// Chopper have not found one or more splits that correspond to the correct
|
||||
// character bounding boxes recorded in BlamerBundle::truth_word.
|
||||
IRR_CHOPPER,
|
||||
// Classifier did include correct unichars for each blob in the correct
|
||||
// segmentation, however its rating could have been too bad to allow the
|
||||
// language model to pull out the correct choice. On the other hand the
|
||||
// strength of the language model might have been too weak to favor the
|
||||
// correct answer, this we call this case a classifier-language model
|
||||
// tradeoff error.
|
||||
IRR_CLASS_LM_TRADEOFF,
|
||||
// Page layout failed to produce the correct bounding box. Blame page layout
|
||||
// if the truth was not found for the word, which implies that the bounding
|
||||
// box of the word was incorrect (no truth word had a similar bounding box).
|
||||
IRR_PAGE_LAYOUT,
|
||||
// SegSearch heuristic prevented one or more blobs from the correct
|
||||
// segmentation state to be classified (e.g. the blob was too wide).
|
||||
IRR_SEGSEARCH_HEUR,
|
||||
// The correct segmentaiton state was not explored because of poor SegSearch
|
||||
// pain point prioritization. We blame SegSearch pain point prioritization
|
||||
// if the best rating of a choice constructed from correct segmentation is
|
||||
// better than that of the best choice (i.e. if we got to explore the correct
|
||||
// segmentation state, language model would have picked the correct choice).
|
||||
IRR_SEGSEARCH_PP,
|
||||
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
|
||||
|
||||
// and thus use the old language model (permuters).
|
||||
// TODO(antonova): integrate the new language mode with chopper
|
||||
IRR_CLASS_OLD_LM_TRADEOFF,
|
||||
// If there is an incorrect adaptive template match with a better score than
|
||||
// a correct one (either pre-trained or adapted), mark this as adaption error.
|
||||
IRR_ADAPTION,
|
||||
// split_and_recog_word() failed to find a suitable split in truth.
|
||||
IRR_NO_TRUTH_SPLIT,
|
||||
// Truth is not available for this word (e.g. when words in corrected content
|
||||
// file are turned into ~~~~ because an appropriate alignment was not found.
|
||||
IRR_NO_TRUTH,
|
||||
// The text recorded in best choice != truth text, but none of the above
|
||||
// reasons are set.
|
||||
IRR_UNKNOWN,
|
||||
|
||||
IRR_NUM_REASONS
|
||||
};
|
||||
|
||||
// Blamer-related information to determine the source of errors.
|
||||
struct BlamerBundle {
|
||||
static const char *IncorrectReasonName(IncorrectResultReason irr);
|
||||
BlamerBundle() : truth_has_char_boxes(false),
|
||||
incorrect_result_reason(IRR_CORRECT),
|
||||
lattice_data(NULL) { ClearResults(); }
|
||||
~BlamerBundle() { delete[] lattice_data; }
|
||||
void ClearResults() {
|
||||
norm_truth_word.DeleteAllBoxes();
|
||||
norm_box_tolerance = 0;
|
||||
if (!NoTruth()) incorrect_result_reason = IRR_CORRECT;
|
||||
debug = "";
|
||||
segsearch_is_looking_for_blame = false;
|
||||
best_correctly_segmented_rating = WERD_CHOICE::kBadRating;
|
||||
correct_segmentation_cols.clear();
|
||||
correct_segmentation_rows.clear();
|
||||
best_choice_is_dict_and_top_choice = false;
|
||||
delete[] lattice_data;
|
||||
lattice_data = NULL;
|
||||
lattice_size = 0;
|
||||
}
|
||||
void CopyTruth(const BlamerBundle &other) {
|
||||
truth_has_char_boxes = other.truth_has_char_boxes;
|
||||
truth_word = other.truth_word;
|
||||
truth_text = other.truth_text;
|
||||
incorrect_result_reason =
|
||||
(other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT);
|
||||
}
|
||||
void CopyResults(const BlamerBundle &other) {
|
||||
norm_truth_word = other.norm_truth_word;
|
||||
norm_box_tolerance = other.norm_box_tolerance;
|
||||
incorrect_result_reason = other.incorrect_result_reason;
|
||||
segsearch_is_looking_for_blame = other.segsearch_is_looking_for_blame;
|
||||
best_correctly_segmented_rating =other.best_correctly_segmented_rating;
|
||||
correct_segmentation_cols = other.correct_segmentation_cols;
|
||||
correct_segmentation_rows = other.correct_segmentation_rows;
|
||||
best_choice_is_dict_and_top_choice =
|
||||
other.best_choice_is_dict_and_top_choice;
|
||||
if (other.lattice_data != NULL) {
|
||||
lattice_data = new char[other.lattice_size];
|
||||
memcpy(lattice_data, other.lattice_data, other.lattice_size);
|
||||
lattice_size = other.lattice_size;
|
||||
} else {
|
||||
lattice_data = NULL;
|
||||
}
|
||||
}
|
||||
BlamerBundle(const BlamerBundle &other) {
|
||||
this->CopyTruth(other);
|
||||
this->CopyResults(other);
|
||||
}
|
||||
const char *IncorrectReason() const;
|
||||
bool NoTruth() const {
|
||||
return (incorrect_result_reason == IRR_NO_TRUTH ||
|
||||
incorrect_result_reason == IRR_PAGE_LAYOUT);
|
||||
}
|
||||
void SetBlame(IncorrectResultReason irr,
|
||||
const STRING &msg, const WERD_CHOICE *choice, bool debug) {
|
||||
this->incorrect_result_reason = irr;
|
||||
this->debug = this->IncorrectReason();
|
||||
this->debug += " to blame: ";
|
||||
this->FillDebugString(msg, choice, &(this->debug));
|
||||
if (debug) tprintf("SetBlame(): %s", this->debug.string());
|
||||
}
|
||||
// Appends choice and truth details to the given debug string.
|
||||
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
|
||||
STRING *debug);
|
||||
|
||||
// Set to true when bounding boxes for individual unichars are recorded.
|
||||
bool truth_has_char_boxes;
|
||||
// The true_word (in the original image coordinate space) contains ground
|
||||
// truth bounding boxes for this WERD_RES.
|
||||
tesseract::BoxWord truth_word;
|
||||
// Same as above, but in normalized coordinates
|
||||
// (filled in by WERD_RES::SetupForRecognition()).
|
||||
tesseract::BoxWord norm_truth_word;
|
||||
// Tolerance for bounding box comparisons in normalized space.
|
||||
int norm_box_tolerance;
|
||||
// Contains ground truth unichar for each of the bounding boxes in truth_word.
|
||||
GenericVector<STRING> truth_text;
|
||||
// The reason for incorrect OCR result.
|
||||
IncorrectResultReason incorrect_result_reason;
|
||||
// Debug text associated with the blame.
|
||||
STRING debug;
|
||||
// Misadaption debug information (filled in if this word was misadapted to).
|
||||
STRING misadaption_debug;
|
||||
// Variables used by the segmentation search when looking for the blame.
|
||||
// Set to true while segmentation search is continued after the usual
|
||||
// termination condition in order to look for the blame.
|
||||
bool segsearch_is_looking_for_blame;
|
||||
// Best rating for correctly segmented path
|
||||
// (set and used by SegSearch when looking for blame).
|
||||
float best_correctly_segmented_rating;
|
||||
// Vectors populated by SegSearch to indicate column and row indices that
|
||||
// correspond to blobs with correct bounding boxes.
|
||||
GenericVector<int> correct_segmentation_cols;
|
||||
GenericVector<int> correct_segmentation_rows;
|
||||
// Set to true if best choice is a dictionary word and
|
||||
// classifier's top choice.
|
||||
bool best_choice_is_dict_and_top_choice;
|
||||
// Serialized segmentation search lattice.
|
||||
char *lattice_data;
|
||||
int lattice_size; // size of lattice_data in bytes
|
||||
// Information about hypotheses (paths) explored by the segmentation search.
|
||||
tesseract::ParamsTrainingBundle params_training_bundle;
|
||||
};
|
||||
|
||||
/* Forward declarations */
|
||||
|
||||
class BLOCK_RES;
|
||||
@ -341,8 +181,11 @@ class WERD_RES : public ELIST_LINK {
|
||||
// TODO(rays) determine if docqual does anything useful and delete bln_boxes
|
||||
// if it doesn't.
|
||||
tesseract::BoxWord* bln_boxes; // BLN input bounding boxes.
|
||||
// The ROW that this word sits in. NOT owned by the WERD_RES.
|
||||
ROW* blob_row;
|
||||
// The denorm provides the transformation to get back to the rotated image
|
||||
// coords from the chopped_word/rebuild_word BLN coords.
|
||||
// coords from the chopped_word/rebuild_word BLN coords, but each blob also
|
||||
// has its own denorm.
|
||||
DENORM denorm; // For use on chopped_word.
|
||||
// Unicharset used by the classifier output in best_choice and raw_choice.
|
||||
const UNICHARSET* uch_set; // For converting back to utf8.
|
||||
@ -355,13 +198,32 @@ class WERD_RES : public ELIST_LINK {
|
||||
// character fragments that make up the word.
|
||||
// The length of chopped_word matches length of seam_array + 1 (if set).
|
||||
TWERD* chopped_word; // BLN chopped fragments output.
|
||||
SEAMS seam_array; // Seams matching chopped_word.
|
||||
WERD_CHOICE *best_choice; // tess output
|
||||
WERD_CHOICE *raw_choice; // top choice permuter
|
||||
// Alternative paths found during chopping/segmentation search stages
|
||||
// (the first entry being a slim copy of best_choice).
|
||||
GenericVector<WERD_CHOICE *> alt_choices;
|
||||
GenericVector<GenericVector<int> > alt_states;
|
||||
// Vector of SEAM* holding chopping points matching chopped_word.
|
||||
GenericVector<SEAM*> seam_array;
|
||||
// Widths of blobs in chopped_word.
|
||||
GenericVector<int> blob_widths;
|
||||
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
|
||||
// blob i and blob i+1.
|
||||
GenericVector<int> blob_gaps;
|
||||
// Ratings matrix contains classifier choices for each classified combination
|
||||
// of blobs. The dimension is the same as the number of blobs in chopped_word
|
||||
// and the leading diagonal corresponds to classifier results of the blobs
|
||||
// in chopped_word. The state_ members of best_choice, raw_choice and
|
||||
// best_choices all correspond to this ratings matrix and allow extraction
|
||||
// of the blob choices for any given WERD_CHOICE.
|
||||
MATRIX* ratings; // Owned pointer.
|
||||
// Pointer to the first WERD_CHOICE in best_choices. This is the result that
|
||||
// will be output from Tesseract. Note that this is now a borrowed pointer
|
||||
// and should NOT be deleted.
|
||||
WERD_CHOICE* best_choice; // Borrowed pointer.
|
||||
// The best raw_choice found during segmentation search. Differs from the
|
||||
// best_choice by being the best result according to just the character
|
||||
// classifier, not taking any language model information into account.
|
||||
// Unlike best_choice, the pointer IS owned by this WERD_RES.
|
||||
WERD_CHOICE* raw_choice; // Owned pointer.
|
||||
// Alternative results found during chopping/segmentation search stages.
|
||||
// Note that being an ELIST, best_choices owns the WERD_CHOICEs.
|
||||
WERD_CHOICE_LIST best_choices;
|
||||
|
||||
// Truth bounding boxes, text and incorrect choice reason.
|
||||
BlamerBundle *blamer_bundle;
|
||||
@ -462,6 +324,8 @@ class WERD_RES : public ELIST_LINK {
|
||||
InitPointers();
|
||||
word = the_word;
|
||||
}
|
||||
// Deep copies everything except the ratings MATRIX.
|
||||
// To get that use deep_copy below.
|
||||
WERD_RES(const WERD_RES &source) {
|
||||
InitPointers();
|
||||
*this = source; // see operator=
|
||||
@ -545,7 +409,11 @@ class WERD_RES : public ELIST_LINK {
|
||||
void InitPointers();
|
||||
void Clear();
|
||||
void ClearResults();
|
||||
void ClearWordChoices();
|
||||
void ClearRatings();
|
||||
|
||||
// Deep copies everything except the ratings MATRIX.
|
||||
// To get that use deep_copy below.
|
||||
WERD_RES& operator=(const WERD_RES& source); //from this
|
||||
|
||||
void CopySimpleFields(const WERD_RES& source);
|
||||
@ -557,18 +425,28 @@ class WERD_RES : public ELIST_LINK {
|
||||
void InitForRetryRecognition(const WERD_RES& source);
|
||||
|
||||
// Sets up the members used in recognition: bln_boxes, chopped_word,
|
||||
// seam_array, denorm, best_choice, raw_choice. Returns false if
|
||||
// seam_array, denorm. Returns false if
|
||||
// the word is empty and sets up fake results. If use_body_size is
|
||||
// true and row->body_size is set, then body_size will be used for
|
||||
// blob normalization instead of xheight + ascrise. This flag is for
|
||||
// those languages that are using CJK pitch model and thus it has to
|
||||
// be true if and only if tesseract->textord_use_cjk_fp_model is
|
||||
// true.
|
||||
// If allow_detailed_fx is true, the feature extractor will receive fine
|
||||
// precision outline information, allowing smoother features and better
|
||||
// features on low resolution images.
|
||||
// Returns false if the word is empty and sets up fake results.
|
||||
bool SetupForTessRecognition(const UNICHARSET& unicharset_in,
|
||||
tesseract::Tesseract* tesseract, Pix* pix,
|
||||
bool numeric_mode, bool use_body_size,
|
||||
bool allow_detailed_fx,
|
||||
ROW *row, BLOCK* block);
|
||||
|
||||
// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
|
||||
// accumulators from a made chopped word. We presume the fields are already
|
||||
// empty.
|
||||
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
|
||||
|
||||
// Sets up the members used in recognition:
|
||||
// bln_boxes, chopped_word, seam_array, denorm.
|
||||
// Returns false if the word is empty and sets up fake results.
|
||||
@ -586,6 +464,87 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Sets up the blamer_bundle if it is not null, using the initialized denorm.
|
||||
void SetupBlamerBundle();
|
||||
|
||||
// Computes the blob_widths and blob_gaps from the chopped_word.
|
||||
void SetupBlobWidthsAndGaps();
|
||||
|
||||
// Updates internal data to account for a new SEAM (chop) at the given
|
||||
// blob_number. Fixes the ratings matrix and states in the choices, as well
|
||||
// as the blob widths and gaps.
|
||||
void InsertSeam(int blob_number, SEAM* seam);
|
||||
|
||||
// Returns true if all the word choices except the first have adjust_factors
|
||||
// worse than the given threshold.
|
||||
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
|
||||
|
||||
// Returns true if the current word is ambiguous (by number of answers or
|
||||
// by dangerous ambigs.)
|
||||
bool IsAmbiguous();
|
||||
|
||||
// Returns true if the ratings matrix size matches the sum of each of the
|
||||
// segmentation states.
|
||||
bool StatesAllValid();
|
||||
|
||||
// Prints a list of words found if debug is true or the word result matches
|
||||
// the word_to_debug.
|
||||
void DebugWordChoices(bool debug, const char* word_to_debug);
|
||||
|
||||
// Removes from best_choices all choices which are not within a reasonable
|
||||
// range of the best choice.
|
||||
void FilterWordChoices(int debug_level);
|
||||
|
||||
// Computes a set of distance thresholds used to control adaption.
|
||||
// Compares the best choice for the current word to the best raw choice
|
||||
// to determine which characters were classified incorrectly by the
|
||||
// classifier. Then places a separate threshold into thresholds for each
|
||||
// character in the word. If the classifier was correct, max_rating is placed
|
||||
// into thresholds. If the classifier was incorrect, the mean match rating
|
||||
// (error percentage) of the classifier's incorrect choice minus some margin
|
||||
// is placed into thresholds. This can then be used by the caller to try to
|
||||
// create a new template for the desired class that will classify the
|
||||
// character with a rating better than the threshold value. The match rating
|
||||
// placed into thresholds is never allowed to be below min_rating in order to
|
||||
// prevent trying to make overly tight templates.
|
||||
// min_rating limits how tight to make a template.
|
||||
// max_rating limits how loose to make a template.
|
||||
// rating_margin denotes the amount of margin to put in template.
|
||||
void ComputeAdaptionThresholds(float certainty_scale,
|
||||
float min_rating,
|
||||
float max_rating,
|
||||
float rating_margin,
|
||||
float* thresholds);
|
||||
|
||||
// Saves a copy of the word_choice if it has the best unadjusted rating.
|
||||
// Returns true if the word_choice was the new best.
|
||||
bool LogNewRawChoice(WERD_CHOICE* word_choice);
|
||||
// Consumes word_choice by adding it to best_choices, (taking ownership) if
|
||||
// the certainty for word_choice is some distance of the best choice in
|
||||
// best_choices, or by deleting the word_choice and returning false.
|
||||
// The best_choices list is kept in sorted order by rating. Duplicates are
|
||||
// removed, and the list is kept no longer than max_num_choices in length.
|
||||
// Returns true if the word_choice is still a valid pointer.
|
||||
bool LogNewCookedChoice(int max_num_choices, bool debug,
|
||||
WERD_CHOICE* word_choice);
|
||||
|
||||
// Prints a brief list of all the best choices.
|
||||
void PrintBestChoices() const;
|
||||
|
||||
// Returns the sum of the widths of the blob between start_blob and last_blob
|
||||
// inclusive.
|
||||
int GetBlobsWidth(int start_blob, int last_blob);
|
||||
// Returns the width of a gap between the specified blob and the next one.
|
||||
int GetBlobsGap(int blob_index);
|
||||
|
||||
// Returns the BLOB_CHOICE corresponding to the given index in the
|
||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete. May return NULL if there is no
|
||||
// BLOB_CHOICE matching the unichar_id at the given index.
|
||||
BLOB_CHOICE* GetBlobChoice(int index) const;
|
||||
|
||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
|
||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete.
|
||||
BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
|
||||
|
||||
// Moves the results fields from word to this. This takes ownership of all
|
||||
// the data, so src can be destructed.
|
||||
// word1.ConsumeWordResult(word);
|
||||
@ -597,10 +556,11 @@ class WERD_RES : public ELIST_LINK {
|
||||
void ConsumeWordResults(WERD_RES* word);
|
||||
|
||||
// Replace the best choice and rebuild box word.
|
||||
void ReplaceBestChoice(const WERD_CHOICE& choice,
|
||||
const GenericVector<int> &segmentation_state);
|
||||
// choice must be from the current best_choices list.
|
||||
void ReplaceBestChoice(WERD_CHOICE* choice);
|
||||
|
||||
// Builds the rebuild_word from the chopped_word and the best_state.
|
||||
// Builds the rebuild_word and sets the best_state from the chopped_word and
|
||||
// the best_choice->state.
|
||||
void RebuildBestState();
|
||||
|
||||
// Copies the chopped_word to the rebuild_word, faking a best_state as well.
|
||||
@ -610,30 +570,26 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Sets/replaces the box_word with one made from the rebuild_word.
|
||||
void SetupBoxWord();
|
||||
|
||||
// Sets up the script positions in the output boxword using the best_choice
|
||||
// Sets up the script positions in the best_choice using the best_choice
|
||||
// to get the unichars, and the unicharset to get the target positions.
|
||||
void SetScriptPositions();
|
||||
|
||||
// Returns the indices [start, end) containing the core of the word, stripped
|
||||
// of any superscript digits on either side.
|
||||
// (i.e., the non-footnote part of the word).
|
||||
// Assumes that BoxWord is all set up for best_choice.
|
||||
void WithoutFootnoteSpan(int *start, int *end) const;
|
||||
|
||||
// Given an alternate word choice and segmentation state, yield the indices
|
||||
// [start, end) containig the core of the word, stripped of any superscript
|
||||
// digits on either side. (i.e. stripping off the footnote parts).
|
||||
void WithoutFootnoteSpan(
|
||||
const WERD_CHOICE &choice, const GenericVector<int> &state,
|
||||
int *start, int *end) const;
|
||||
// Sets all the blobs in all the words (best choice and alternates) to be
|
||||
// the given position. (When a sub/superscript is recognized as a separate
|
||||
// word, it falls victim to the rule that a whole word cannot be sub or
|
||||
// superscript, so this function overrides that problem.)
|
||||
void SetAllScriptPositions(tesseract::ScriptPos position);
|
||||
|
||||
// Classifies the word with some already-calculated BLOB_CHOICEs.
|
||||
// The choices are an array of blob_count pointers to BLOB_CHOICE,
|
||||
// providing a single classifier result for each blob.
|
||||
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
||||
// The number of blobs in the outword must match blob_count.
|
||||
// The number of blobs in the box_word must match blob_count.
|
||||
void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
|
||||
|
||||
// Creates a WERD_CHOICE for the word using the top choices from the leading
|
||||
// diagonal of the ratings matrix.
|
||||
void FakeWordFromRatings();
|
||||
|
||||
// Copies the best_choice strings to the correct_text for adaption/training.
|
||||
void BestChoiceToCorrectText();
|
||||
|
||||
@ -644,13 +600,16 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Returns true if anything was merged.
|
||||
bool ConditionalBlobMerge(
|
||||
TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
|
||||
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb);
|
||||
|
||||
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
|
||||
// all the data to account for the change.
|
||||
void MergeAdjacentBlobs(int index);
|
||||
|
||||
// Callback helper for fix_quotes returns a double quote if both
|
||||
// arguments are quote, otherwise INVALID_UNICHAR_ID.
|
||||
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
|
||||
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
void fix_quotes();
|
||||
|
||||
// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
|
||||
// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
|
||||
@ -658,15 +617,21 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Callback helper for fix_hyphens returns true if box1 and box2 overlap
|
||||
// (assuming both on the same textline, are in order and a chopped em dash.)
|
||||
bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
|
||||
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
void fix_hyphens();
|
||||
|
||||
// Callback helper for merge_tess_fails returns a space if both
|
||||
// arguments are space, otherwise INVALID_UNICHAR_ID.
|
||||
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
|
||||
void merge_tess_fails();
|
||||
|
||||
// Returns a really deep copy of *src, including the ratings MATRIX.
|
||||
static WERD_RES* deep_copy(const WERD_RES* src) {
|
||||
return new WERD_RES(*src);
|
||||
WERD_RES* result = new WERD_RES(*src);
|
||||
// That didn't copy the ratings, but we want a copy if there is one to
|
||||
// begin width.
|
||||
if (src->ratings != NULL)
|
||||
result->ratings = src->ratings->DeepCopy();
|
||||
return result;
|
||||
}
|
||||
|
||||
// Copy blobs from word_res onto this word (eliminating spaces between).
|
||||
|
40
ccstruct/params_training_featdef.cpp
Normal file
40
ccstruct/params_training_featdef.cpp
Normal file
@ -0,0 +1,40 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: params_training_featdef.cpp
|
||||
// Description: Utility functions for params training features.
|
||||
// Author: David Eger
|
||||
// Created: Mon Jun 11 11:26:42 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "params_training_featdef.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
int ParamsTrainingFeatureByName(const char *name) {
|
||||
if (name == NULL)
|
||||
return -1;
|
||||
int array_size = sizeof(kParamsTrainingFeatureTypeName) /
|
||||
sizeof(kParamsTrainingFeatureTypeName[0]);
|
||||
for (int i = 0; i < array_size; i++) {
|
||||
if (kParamsTrainingFeatureTypeName[i] == NULL)
|
||||
continue;
|
||||
if (strcmp(name, kParamsTrainingFeatureTypeName[i]) == 0)
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
@ -25,67 +25,97 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Maximum number of unichars in the small and medium sized words
|
||||
static const int kMaxSmallWordUnichars = 3;
|
||||
static const int kMaxMediumWordUnichars = 6;
|
||||
|
||||
// Raw features extracted from a single OCR hypothesis.
|
||||
// The features are non-normalized real-valued quantities with
|
||||
// unbounded range and unknown distribution.
|
||||
// The features are normalized (by outline length or number of unichars as
|
||||
// appropriate) real-valued quantities with unbounded range and
|
||||
// unknown distribution.
|
||||
// Normalization / binarization of these features is done at a later stage.
|
||||
// Note: when adding new fields to this enum make sure to modify
|
||||
// kParamsTrainingRawFeatureTypeName enum accordingly.
|
||||
enum ParamsTrainingRawFeatureType {
|
||||
// What dictionary (if any) was this hypothesis found in.
|
||||
// See PermuterType enum in ccstruct/ratngs.h for interpretation.
|
||||
PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE, // 0
|
||||
// Boolean indicator of whether this hypothesis is ambiguous to a known
|
||||
// dictionary word (or a valid number pattern).
|
||||
PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH, // 1
|
||||
// Shape cost of the segmentation path for this hypothesis.
|
||||
PTRAIN_RAW_FEATURE_SHAPE_COST, // 2
|
||||
// Character ngram probability of the string of unichars of this hypothesis.
|
||||
PTRAIN_RAW_FEATURE_NGRAM_PROB, // 3
|
||||
// Number of bad/inconsistent spots in this hypothesis.
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_PUNC, // 4
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_CASE, // 5
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE, // 6
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_SPACING, // 7
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT, // 8
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_FONT, // 9
|
||||
// Classifier-related features.
|
||||
PTRAIN_RAW_FEATURE_WORST_CERT, // 10
|
||||
PTRAIN_RAW_FEATURE_RATING, // 11
|
||||
// Number of classifier results that came from adapted templates.
|
||||
PTRAIN_RAW_FEATURE_ADAPTED, // 12
|
||||
// Features potentially useful for normalization.
|
||||
PTRAIN_RAW_FEATURE_NUM_UNICHARS, // 13
|
||||
PTRAIN_RAW_FEATURE_OUTLINE_LEN, // 14
|
||||
// kParamsTrainingFeatureTypeName
|
||||
enum kParamsTrainingFeatureType {
|
||||
// Digits
|
||||
PTRAIN_DIGITS_SHORT, // 0
|
||||
PTRAIN_DIGITS_MED, // 1
|
||||
PTRAIN_DIGITS_LONG, // 2
|
||||
// Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
|
||||
PTRAIN_NUM_SHORT, // 3
|
||||
PTRAIN_NUM_MED, // 4
|
||||
PTRAIN_NUM_LONG, // 5
|
||||
// Document word (DOC_DAWG_PERM)
|
||||
PTRAIN_DOC_SHORT, // 6
|
||||
PTRAIN_DOC_MED, // 7
|
||||
PTRAIN_DOC_LONG, // 8
|
||||
// Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
|
||||
PTRAIN_DICT_SHORT, // 9
|
||||
PTRAIN_DICT_MED, // 10
|
||||
PTRAIN_DICT_LONG, // 11
|
||||
// Frequent word (FREQ_DAWG_PERM)
|
||||
PTRAIN_FREQ_SHORT, // 12
|
||||
PTRAIN_FREQ_MED, // 13
|
||||
PTRAIN_FREQ_LONG, // 14
|
||||
PTRAIN_SHAPE_COST_PER_CHAR, // 15
|
||||
PTRAIN_NGRAM_COST_PER_CHAR, // 16
|
||||
PTRAIN_NUM_BAD_PUNC, // 17
|
||||
PTRAIN_NUM_BAD_CASE, // 18
|
||||
PTRAIN_XHEIGHT_CONSISTENCY, // 19
|
||||
PTRAIN_NUM_BAD_CHAR_TYPE, // 20
|
||||
PTRAIN_NUM_BAD_SPACING, // 21
|
||||
PTRAIN_NUM_BAD_FONT, // 22
|
||||
PTRAIN_RATING_PER_CHAR, // 23
|
||||
|
||||
PTRAIN_NUM_RAW_FEATURE_TYPES
|
||||
PTRAIN_NUM_FEATURE_TYPES
|
||||
};
|
||||
|
||||
static const char * const kParamsTrainingRawFeatureTypeName[] = {
|
||||
"DICT_MATCH_TYPE", // 0
|
||||
"UNAMBIG_DICT_MATCH", // 1
|
||||
"SHAPE_COST", // 2
|
||||
"NGRAM_PROB", // 3
|
||||
"NUM_BAD_PUNC", // 4
|
||||
"NUM_BAD_CASE", // 5
|
||||
"NUM_BAD_CHAR_TYPE", // 6
|
||||
"NUM_BAD_SPACING", // 7
|
||||
"NUM_BAD_SCRIPT", // 8
|
||||
"NUM_BAD_FONT", // 9
|
||||
"WORST_CERT", // 10
|
||||
"RATING", // 11
|
||||
"ADAPTED", // 12
|
||||
"NUM_UNICHARS", // 13
|
||||
"OUTLINE_LEN", // 14
|
||||
static const char * const kParamsTrainingFeatureTypeName[] = {
|
||||
"PTRAIN_DIGITS_SHORT", // 0
|
||||
"PTRAIN_DIGITS_MED", // 1
|
||||
"PTRAIN_DIGITS_LONG", // 2
|
||||
"PTRAIN_NUM_SHORT", // 3
|
||||
"PTRAIN_NUM_MED", // 4
|
||||
"PTRAIN_NUM_LONG", // 5
|
||||
"PTRAIN_DOC_SHORT", // 6
|
||||
"PTRAIN_DOC_MED", // 7
|
||||
"PTRAIN_DOC_LONG", // 8
|
||||
"PTRAIN_DICT_SHORT", // 9
|
||||
"PTRAIN_DICT_MED", // 10
|
||||
"PTRAIN_DICT_LONG", // 11
|
||||
"PTRAIN_FREQ_SHORT", // 12
|
||||
"PTRAIN_FREQ_MED", // 13
|
||||
"PTRAIN_FREQ_LONG", // 14
|
||||
"PTRAIN_SHAPE_COST_PER_CHAR", // 15
|
||||
"PTRAIN_NGRAM_COST_PER_CHAR", // 16
|
||||
"PTRAIN_NUM_BAD_PUNC", // 17
|
||||
"PTRAIN_NUM_BAD_CASE", // 18
|
||||
"PTRAIN_XHEIGHT_CONSISTENCY", // 19
|
||||
"PTRAIN_NUM_BAD_CHAR_TYPE", // 20
|
||||
"PTRAIN_NUM_BAD_SPACING", // 21
|
||||
"PTRAIN_NUM_BAD_FONT", // 22
|
||||
"PTRAIN_RATING_PER_CHAR", // 23
|
||||
};
|
||||
|
||||
// Returns the index of the given feature (by name),
|
||||
// or -1 meaning the feature is unknown.
|
||||
int ParamsTrainingFeatureByName(const char *name);
|
||||
|
||||
|
||||
// Entry with features extracted from a single OCR hypothesis for a word.
|
||||
struct ParamsTrainingHypothesis {
|
||||
ParamsTrainingHypothesis() {
|
||||
for (int i = 0; i < PTRAIN_NUM_RAW_FEATURE_TYPES; ++i) features[i] = 0.0;
|
||||
ParamsTrainingHypothesis() : cost(0.0) {
|
||||
memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
|
||||
}
|
||||
float features[PTRAIN_NUM_RAW_FEATURE_TYPES];
|
||||
ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other) {
|
||||
memcpy(features, other.features,
|
||||
sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
|
||||
str = other.str;
|
||||
cost = other.cost;
|
||||
}
|
||||
float features[PTRAIN_NUM_FEATURE_TYPES];
|
||||
STRING str; // string corresponding to word hypothesis (for debugging)
|
||||
float cost; // path cost computed by segsearch
|
||||
};
|
||||
|
||||
// A list of hypotheses explored during one run of segmentation search.
|
||||
@ -104,9 +134,10 @@ class ParamsTrainingBundle {
|
||||
}
|
||||
// Adds a new ParamsTrainingHypothesis to the current hypothesis list
|
||||
// and returns the reference to the newly added entry.
|
||||
ParamsTrainingHypothesis &AddHypothesis() {
|
||||
ParamsTrainingHypothesis &AddHypothesis(
|
||||
const ParamsTrainingHypothesis &other) {
|
||||
if (hyp_list_vec.empty()) StartHypothesisList();
|
||||
hyp_list_vec.back().push_back(ParamsTrainingHypothesis());
|
||||
hyp_list_vec.back().push_back(ParamsTrainingHypothesis(other));
|
||||
return hyp_list_vec.back().back();
|
||||
}
|
||||
|
||||
|
@ -19,13 +19,33 @@
|
||||
|
||||
#include "ratngs.h"
|
||||
|
||||
#include "blobs.h"
|
||||
#include "callcpp.h"
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
#include "normalis.h" // kBlnBaselineOffset.
|
||||
#include "unicharset.h"
|
||||
|
||||
ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE);
|
||||
using tesseract::ScriptPos;
|
||||
|
||||
ELISTIZE(BLOB_CHOICE);
|
||||
ELISTIZE(WERD_CHOICE);
|
||||
|
||||
const float WERD_CHOICE::kBadRating = 100000.0;
|
||||
// Min offset in baseline-normalized coords to make a character a subscript.
|
||||
const int kMinSubscriptOffset = 20;
|
||||
// Min offset in baseline-normalized coords to make a character a superscript.
|
||||
const int kMinSuperscriptOffset = 20;
|
||||
// Max y of bottom of a drop-cap blob.
|
||||
const int kMaxDropCapBottom = -128;
|
||||
// Max fraction of x-height to use as denominator in measuring x-height overlap.
|
||||
const double kMaxOverlapDenominator = 0.125;
|
||||
// Min fraction of x-height range that should be in agreement for matching
|
||||
// x-heights.
|
||||
const double kMinXHeightMatch = 0.5;
|
||||
// Max tolerance on baseline position as a fraction of x-height for matching
|
||||
// baselines.
|
||||
const double kMaxBaselineDrift = 0.0625;
|
||||
|
||||
static const char kPermuterTypeNoPerm[] = "None";
|
||||
static const char kPermuterTypePuncPerm[] = "Punctuation";
|
||||
@ -68,20 +88,20 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
|
||||
inT16 src_fontinfo_id, // font
|
||||
inT16 src_fontinfo_id2, // 2nd choice font
|
||||
int src_script_id, // script
|
||||
inT16 min_xheight, // min xheight allowed
|
||||
inT16 max_xheight, // max xheight by this char
|
||||
bool adapted // adapted match or not
|
||||
) {
|
||||
float min_xheight, // min xheight allowed
|
||||
float max_xheight, // max xheight by this char
|
||||
float yshift, // yshift out of position
|
||||
BlobChoiceClassifier c) { // adapted match or other
|
||||
unichar_id_ = src_unichar_id;
|
||||
rating_ = src_rating;
|
||||
certainty_ = src_cert;
|
||||
fontinfo_id_ = src_fontinfo_id;
|
||||
fontinfo_id2_ = src_fontinfo_id2;
|
||||
script_id_ = src_script_id;
|
||||
language_model_state_ = NULL;
|
||||
min_xheight_ = min_xheight;
|
||||
max_xheight_ = max_xheight;
|
||||
adapted_ = adapted;
|
||||
yshift_ = yshift;
|
||||
classifier_ = c;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -96,12 +116,75 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
|
||||
fontinfo_id_ = other.fontinfo_id();
|
||||
fontinfo_id2_ = other.fontinfo_id2();
|
||||
script_id_ = other.script_id();
|
||||
language_model_state_ = NULL;
|
||||
matrix_cell_ = other.matrix_cell_;
|
||||
min_xheight_ = other.min_xheight_;
|
||||
max_xheight_ = other.max_xheight_;
|
||||
adapted_ = other.adapted_;
|
||||
yshift_ = other.yshift();
|
||||
classifier_ = other.classifier_;
|
||||
}
|
||||
|
||||
// Returns true if *this and other agree on the baseline and x-height
|
||||
// to within some tolerance based on a given estimate of the x-height.
|
||||
bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
|
||||
bool debug) const {
|
||||
double baseline_diff = fabs(yshift() - other.yshift());
|
||||
if (baseline_diff > kMaxBaselineDrift * x_height) {
|
||||
if (debug) {
|
||||
tprintf("Baseline diff %g for %d v %d\n",
|
||||
baseline_diff, unichar_id_, other.unichar_id_);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
double this_range = max_xheight() - min_xheight();
|
||||
double other_range = other.max_xheight() - other.min_xheight();
|
||||
double denominator = ClipToRange(MIN(this_range, other_range),
|
||||
1.0, kMaxOverlapDenominator * x_height);
|
||||
double overlap = MIN(max_xheight(), other.max_xheight()) -
|
||||
MAX(min_xheight(), other.min_xheight());
|
||||
overlap /= denominator;
|
||||
if (debug) {
|
||||
tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
|
||||
unichar_id_, other.unichar_id_, baseline_diff,
|
||||
this_range, other_range, denominator, overlap);
|
||||
}
|
||||
|
||||
return overlap >= kMinXHeightMatch;
|
||||
}
|
||||
|
||||
// Helper to find the BLOB_CHOICE in the bc_list that matches the given
|
||||
// unichar_id, or NULL if there is no match.
|
||||
BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
|
||||
BLOB_CHOICE_LIST* bc_list) {
|
||||
// Find the corresponding best BLOB_CHOICE.
|
||||
BLOB_CHOICE_IT choice_it(bc_list);
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
BLOB_CHOICE* choice = choice_it.data();
|
||||
if (choice->unichar_id() == char_id) {
|
||||
return choice;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char *WERD_CHOICE::permuter_name(uinT8 permuter) {
|
||||
return kPermuterTypeNames[permuter];
|
||||
}
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
const char *ScriptPosToString(enum ScriptPos script_pos) {
|
||||
switch (script_pos) {
|
||||
case SP_NORMAL: return "NORM";
|
||||
case SP_SUBSCRIPT: return "SUB";
|
||||
case SP_SUPERSCRIPT: return "SUPER";
|
||||
case SP_DROPCAP: return "DROPC";
|
||||
}
|
||||
return "SP_UNKNOWN";
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
/**
|
||||
* WERD_CHOICE::WERD_CHOICE
|
||||
*
|
||||
@ -111,16 +194,13 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
|
||||
WERD_CHOICE::WERD_CHOICE(const char *src_string,
|
||||
const UNICHARSET &unicharset)
|
||||
: unicharset_(&unicharset){
|
||||
STRING src_lengths;
|
||||
const char *ptr = src_string;
|
||||
const char *end = src_string + strlen(src_string);
|
||||
int step = unicharset.step(ptr);
|
||||
for (; ptr < end && step > 0;
|
||||
step = unicharset.step(ptr), src_lengths += step, ptr += step);
|
||||
if (step != 0 && ptr == end) {
|
||||
this->init(src_string, src_lengths.string(),
|
||||
0.0, 0.0, NO_PERM);
|
||||
} else { // there must have been an invalid unichar in the string
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
GenericVector<char> lengths;
|
||||
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
|
||||
lengths.push_back('\0');
|
||||
STRING src_lengths = &lengths[0];
|
||||
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
|
||||
} else { // There must have been an invalid unichar in the string.
|
||||
this->init(8);
|
||||
this->make_bad();
|
||||
}
|
||||
@ -152,13 +232,16 @@ void WERD_CHOICE::init(const char *src_string,
|
||||
int unichar_length = src_lengths ? src_lengths[i] : 1;
|
||||
unichar_ids_[i] =
|
||||
unicharset_->unichar_to_id(src_string+offset, unichar_length);
|
||||
fragment_lengths_[i] = 1;
|
||||
state_[i] = 1;
|
||||
certainties_[i] = src_certainty;
|
||||
offset += unichar_length;
|
||||
}
|
||||
}
|
||||
adjust_factor_ = 1.0f;
|
||||
rating_ = src_rating;
|
||||
certainty_ = src_certainty;
|
||||
permuter_ = src_permuter;
|
||||
dangerous_ambig_found_ = false;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -166,25 +249,46 @@ void WERD_CHOICE::init(const char *src_string,
|
||||
*/
|
||||
WERD_CHOICE::~WERD_CHOICE() {
|
||||
delete[] unichar_ids_;
|
||||
delete[] fragment_lengths_;
|
||||
delete_blob_choices();
|
||||
delete[] script_pos_;
|
||||
delete[] state_;
|
||||
delete[] certainties_;
|
||||
}
|
||||
|
||||
const char *WERD_CHOICE::permuter_name() const {
|
||||
return kPermuterTypeNames[permuter_];
|
||||
}
|
||||
|
||||
/**
|
||||
* WERD_CHOICE::set_blob_choices
|
||||
*
|
||||
* Delete current blob_choices. Set the blob_choices to the given new
|
||||
* list.
|
||||
*/
|
||||
void WERD_CHOICE::set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
if (blob_choices_ != blob_choices) {
|
||||
delete_blob_choices();
|
||||
blob_choices_ = blob_choices;
|
||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
||||
// taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete.
|
||||
BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
|
||||
MATRIX_COORD coord = MatrixCoord(index);
|
||||
BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
|
||||
if (result == NULL) {
|
||||
result = new BLOB_CHOICE_LIST;
|
||||
ratings->put(coord.col, coord.row, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
||||
// MATRIX for the given index into the word.
|
||||
MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
|
||||
int col = 0;
|
||||
for (int i = 0; i < index; ++i)
|
||||
col += state_[i];
|
||||
int row = col + state_[index] - 1;
|
||||
return MATRIX_COORD(col, row);
|
||||
}
|
||||
|
||||
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
||||
// unit fragment lengths, but setting the state for this index to blob_count.
|
||||
void WERD_CHOICE::set_blob_choice(int index, int blob_count,
|
||||
const BLOB_CHOICE* blob_choice) {
|
||||
unichar_ids_[index] = blob_choice->unichar_id();
|
||||
script_pos_[index] = tesseract::SP_NORMAL;
|
||||
state_[index] = blob_count;
|
||||
certainties_[index] = blob_choice->certainty();
|
||||
}
|
||||
|
||||
|
||||
@ -211,9 +315,18 @@ bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
|
||||
*/
|
||||
void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
||||
ASSERT_HOST(start >= 0 && start + num <= length_);
|
||||
for (int i = start; i+num < length_; ++i) {
|
||||
unichar_ids_[i] = unichar_ids_[i+num];
|
||||
fragment_lengths_[i] = fragment_lengths_[i+num];
|
||||
// Accumulate the states to account for the merged blobs.
|
||||
for (int i = 0; i < num; ++i) {
|
||||
if (start > 0)
|
||||
state_[start - 1] += state_[start + i];
|
||||
else if (start + num < length_)
|
||||
state_[start + num] += state_[start + i];
|
||||
}
|
||||
for (int i = start; i + num < length_; ++i) {
|
||||
unichar_ids_[i] = unichar_ids_[i + num];
|
||||
script_pos_[i] = script_pos_[i + num];
|
||||
state_[i] = state_[i + num];
|
||||
certainties_[i] = certainties_[i + num];
|
||||
}
|
||||
length_ -= num;
|
||||
}
|
||||
@ -224,7 +337,7 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
||||
* Reverses and mirrors unichars in unichar_ids.
|
||||
*/
|
||||
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
|
||||
for (int i = 0; i < length_/2; ++i) {
|
||||
for (int i = 0; i < length_ / 2; ++i) {
|
||||
UNICHAR_ID tmp_id = unichar_ids_[i];
|
||||
unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
|
||||
unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
|
||||
@ -255,6 +368,23 @@ void WERD_CHOICE::punct_stripped(int *start, int *end) const {
|
||||
(*end)++;
|
||||
}
|
||||
|
||||
void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
|
||||
int end = length();
|
||||
while (end > 0 &&
|
||||
unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
|
||||
BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
|
||||
end--;
|
||||
}
|
||||
int start = 0;
|
||||
while (start < end &&
|
||||
unicharset_->get_isdigit(unichar_ids_[start]) &&
|
||||
BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
|
||||
start++;
|
||||
}
|
||||
*pstart = start;
|
||||
*pend = end;
|
||||
}
|
||||
|
||||
WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
|
||||
ASSERT_HOST(start >= 0 && start <= length_);
|
||||
ASSERT_HOST(end >= 0 && end <= length_);
|
||||
@ -262,7 +392,7 @@ WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
|
||||
WERD_CHOICE retval(unicharset_, end - start);
|
||||
for (int i = start; i < end; i++) {
|
||||
retval.append_unichar_id_space_allocated(
|
||||
unichar_ids_[i], fragment_lengths_[i], 0.0f, 0.0f);
|
||||
unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
@ -310,12 +440,12 @@ void WERD_CHOICE::string_and_lengths(STRING *word_str,
|
||||
* and call append_unichar_id_space_allocated().
|
||||
*/
|
||||
void WERD_CHOICE::append_unichar_id(
|
||||
UNICHAR_ID unichar_id, char fragment_length,
|
||||
UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty) {
|
||||
if (length_ == reserved_) {
|
||||
this->double_the_size();
|
||||
}
|
||||
this->append_unichar_id_space_allocated(unichar_id, fragment_length,
|
||||
this->append_unichar_id_space_allocated(unichar_id, blob_count,
|
||||
rating, certainty);
|
||||
}
|
||||
|
||||
@ -327,59 +457,31 @@ void WERD_CHOICE::append_unichar_id(
|
||||
* If the permuters are NOT the same the permuter is set to COMPOUND_PERM
|
||||
*/
|
||||
WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
|
||||
// TODO(daria): find out why the choice was cleared this way if any
|
||||
// of the pieces are empty. Add the description of this behavior
|
||||
// to the comments.
|
||||
// if (word_string.length () == 0 || second.word_string.length () == 0) {
|
||||
// word_string = NULL; //make it empty
|
||||
// word_lengths = NULL;
|
||||
// delete_blob_choices();
|
||||
// } else {
|
||||
ASSERT_HOST(unicharset_ == second.unicharset_);
|
||||
while (reserved_ < length_ + second.length()) {
|
||||
this->double_the_size();
|
||||
}
|
||||
const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
|
||||
const char *other_fragment_lengths = second.fragment_lengths();
|
||||
for (int i = 0; i < second.length(); ++i) {
|
||||
unichar_ids_[length_ + i] = other_unichar_ids[i];
|
||||
fragment_lengths_[length_ + i] = other_fragment_lengths[i];
|
||||
state_[length_ + i] = second.state_[i];
|
||||
certainties_[length_ + i] = second.certainties_[i];
|
||||
script_pos_[length_ + i] = second.BlobPosition(i);
|
||||
}
|
||||
length_ += second.length();
|
||||
if (second.adjust_factor_ > adjust_factor_)
|
||||
adjust_factor_ = second.adjust_factor_;
|
||||
rating_ += second.rating(); // add ratings
|
||||
if (second.certainty() < certainty_) // take min
|
||||
certainty_ = second.certainty();
|
||||
if (second.dangerous_ambig_found_)
|
||||
dangerous_ambig_found_ = true;
|
||||
if (permuter_ == NO_PERM) {
|
||||
permuter_ = second.permuter();
|
||||
} else if (second.permuter() != NO_PERM &&
|
||||
second.permuter() != permuter_) {
|
||||
permuter_ = COMPOUND_PERM;
|
||||
}
|
||||
|
||||
// Append a deep copy of second blob_choices if it exists.
|
||||
if (second.blob_choices_ != NULL) {
|
||||
if (this->blob_choices_ == NULL)
|
||||
this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST;
|
||||
|
||||
BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
|
||||
BLOB_CHOICE_LIST_C_IT second_blob_choices_it;
|
||||
|
||||
this_blob_choices_it.set_to_list(this->blob_choices_);
|
||||
this_blob_choices_it.move_to_last();
|
||||
|
||||
second_blob_choices_it.set_to_list(second.blob_choices_);
|
||||
|
||||
for (second_blob_choices_it.mark_cycle_pt();
|
||||
!second_blob_choices_it.cycled_list();
|
||||
second_blob_choices_it.forward()) {
|
||||
|
||||
BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
|
||||
blob_choices_copy->deep_copy(second_blob_choices_it.data(),
|
||||
&BLOB_CHOICE::deep_copy);
|
||||
|
||||
this_blob_choices_it.add_after_then_move(blob_choices_copy);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -397,55 +499,202 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
|
||||
|
||||
unicharset_ = source.unicharset_;
|
||||
const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
|
||||
const char *other_fragment_lengths = source.fragment_lengths();
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
unichar_ids_[i] = other_unichar_ids[i];
|
||||
fragment_lengths_[i] = other_fragment_lengths[i];
|
||||
state_[i] = source.state_[i];
|
||||
certainties_[i] = source.certainties_[i];
|
||||
script_pos_[i] = source.BlobPosition(i);
|
||||
}
|
||||
length_ = source.length();
|
||||
adjust_factor_ = source.adjust_factor_;
|
||||
rating_ = source.rating();
|
||||
certainty_ = source.certainty();
|
||||
min_x_height_ = source.min_x_height();
|
||||
max_x_height_ = source.max_x_height();
|
||||
permuter_ = source.permuter();
|
||||
fragment_mark_ = source.fragment_mark();
|
||||
|
||||
// Delete existing blob_choices
|
||||
this->delete_blob_choices();
|
||||
|
||||
// Deep copy blob_choices of source
|
||||
if (source.blob_choices_ != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
|
||||
BLOB_CHOICE_LIST_C_IT source_blob_choices_it;
|
||||
|
||||
this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST();
|
||||
|
||||
this_blob_choices_it.set_to_list(this->blob_choices_);
|
||||
source_blob_choices_it.set_to_list(source.blob_choices_);
|
||||
|
||||
for (source_blob_choices_it.mark_cycle_pt();
|
||||
!source_blob_choices_it.cycled_list();
|
||||
source_blob_choices_it.forward()) {
|
||||
|
||||
BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
|
||||
blob_choices_copy->deep_copy(source_blob_choices_it.data(),
|
||||
&BLOB_CHOICE::deep_copy);
|
||||
|
||||
this_blob_choices_it.add_after_then_move(blob_choices_copy);
|
||||
}
|
||||
}
|
||||
dangerous_ambig_found_ = source.dangerous_ambig_found_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* WERD_CHOICE::delete_blob_choices
|
||||
*
|
||||
* Clear the blob_choices list, delete it and set it to NULL.
|
||||
**********************************************************************/
|
||||
void WERD_CHOICE::delete_blob_choices() {
|
||||
if (blob_choices_ != NULL) {
|
||||
blob_choices_->deep_clear();
|
||||
delete blob_choices_;
|
||||
blob_choices_ = NULL;
|
||||
// Sets up the script_pos_ member using the blobs_list to get the bln
|
||||
// bounding boxes, *this to get the unichars, and this->unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
||||
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) {
|
||||
// Since WERD_CHOICE isn't supposed to depend on a Tesseract,
|
||||
// we don't have easy access to the flags Tesseract stores. Therefore, debug
|
||||
// for this module is hard compiled in.
|
||||
int debug = 0;
|
||||
|
||||
// Initialize to normal.
|
||||
for (int i = 0; i < length_; ++i)
|
||||
script_pos_[i] = tesseract::SP_NORMAL;
|
||||
if (word->blobs.empty())
|
||||
return;
|
||||
|
||||
int position_counts[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
position_counts[i] = 0;
|
||||
}
|
||||
|
||||
int chunk_index = 0;
|
||||
for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
|
||||
TBLOB* tblob = word->blobs[chunk_index];
|
||||
int uni_id = unichar_id(blob_index);
|
||||
TBOX blob_box = tblob->bounding_box();
|
||||
if (state_ != NULL) {
|
||||
for (int i = 1; i < state_[blob_index]; ++i) {
|
||||
++chunk_index;
|
||||
tblob = word->blobs[chunk_index];
|
||||
blob_box += tblob->bounding_box();
|
||||
}
|
||||
}
|
||||
script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
|
||||
uni_id);
|
||||
if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
|
||||
script_pos_[blob_index] = tesseract::SP_NORMAL;
|
||||
}
|
||||
position_counts[script_pos_[blob_index]]++;
|
||||
}
|
||||
// If almost everything looks like a superscript or subscript,
|
||||
// we most likely just got the baseline wrong.
|
||||
if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
|
||||
position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
|
||||
if (debug >= 2) {
|
||||
tprintf("Most characters of %s are subscript or superscript.\n"
|
||||
"That seems wrong, so I'll assume we got the baseline wrong\n",
|
||||
unichar_string().string());
|
||||
}
|
||||
for (int i = 0; i < length_; i++) {
|
||||
ScriptPos sp = script_pos_[i];
|
||||
if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
|
||||
position_counts[sp]--;
|
||||
position_counts[tesseract::SP_NORMAL]++;
|
||||
script_pos_[i] = tesseract::SP_NORMAL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
|
||||
debug >= 2) {
|
||||
tprintf("SetScriptPosition on %s\n", unichar_string().string());
|
||||
int chunk_index = 0;
|
||||
for (int blob_index = 0; blob_index < length_; ++blob_index) {
|
||||
if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
|
||||
TBLOB* tblob = word->blobs[chunk_index];
|
||||
ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
|
||||
unichar_id(blob_index));
|
||||
}
|
||||
chunk_index += state_ != NULL ? state_[blob_index] : 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Sets the script_pos_ member from some source positions with a given length.
|
||||
void WERD_CHOICE::SetScriptPositions(const tesseract::ScriptPos* positions,
|
||||
int length) {
|
||||
ASSERT_HOST(length == length_);
|
||||
if (positions != script_pos_) {
|
||||
delete [] script_pos_;
|
||||
script_pos_ = new ScriptPos[length];
|
||||
memcpy(script_pos_, positions, sizeof(positions[0]) * length);
|
||||
}
|
||||
}
|
||||
// Sets all the script_pos_ positions to the given position.
|
||||
void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
|
||||
for (int i = 0; i < length_; ++i)
|
||||
script_pos_[i] = position;
|
||||
}
|
||||
|
||||
/* static */
|
||||
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
|
||||
const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
UNICHAR_ID unichar_id) {
|
||||
ScriptPos retval = tesseract::SP_NORMAL;
|
||||
int top = blob_box.top();
|
||||
int bottom = blob_box.bottom();
|
||||
int min_bottom, max_bottom, min_top, max_top;
|
||||
unicharset.get_top_bottom(unichar_id,
|
||||
&min_bottom, &max_bottom,
|
||||
&min_top, &max_top);
|
||||
|
||||
int sub_thresh_top = min_top - kMinSubscriptOffset;
|
||||
int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
|
||||
int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
|
||||
if (bottom <= kMaxDropCapBottom) {
|
||||
retval = tesseract::SP_DROPCAP;
|
||||
} else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
|
||||
retval = tesseract::SP_SUBSCRIPT;
|
||||
} else if (bottom > sup_thresh_bot) {
|
||||
retval = tesseract::SP_SUPERSCRIPT;
|
||||
}
|
||||
|
||||
if (print_debug) {
|
||||
const char *pos = ScriptPosToString(retval);
|
||||
tprintf("%s Character %s[bot:%d top: %d] "
|
||||
"bot_range[%d,%d] top_range[%d, %d] "
|
||||
"sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
|
||||
pos, unicharset.id_to_unichar(unichar_id),
|
||||
bottom, top,
|
||||
min_bottom, max_bottom, min_top, max_top,
|
||||
sub_thresh_bot, sub_thresh_top,
|
||||
sup_thresh_bot);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Returns the script-id (eg Han) of the dominant script in the word.
|
||||
int WERD_CHOICE::GetTopScriptID() const {
|
||||
int max_script = unicharset_->get_script_table_size();
|
||||
int *sid = new int[max_script];
|
||||
int x;
|
||||
for (x = 0; x < max_script; x++) sid[x] = 0;
|
||||
for (x = 0; x < length_; ++x) {
|
||||
int script_id = unicharset_->get_script(unichar_id(x));
|
||||
sid[script_id]++;
|
||||
}
|
||||
if (unicharset_->han_sid() != unicharset_->null_sid()) {
|
||||
// Add the Hiragana & Katakana counts to Han and zero them out.
|
||||
if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
|
||||
sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
|
||||
sid[unicharset_->hiragana_sid()] = 0;
|
||||
}
|
||||
if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
|
||||
sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
|
||||
sid[unicharset_->katakana_sid()] = 0;
|
||||
}
|
||||
}
|
||||
// Note that high script ID overrides lower one on a tie, thus biasing
|
||||
// towards non-Common script (if sorted that way in unicharset file).
|
||||
int max_sid = 0;
|
||||
for (x = 1; x < max_script; x++)
|
||||
if (sid[x] >= sid[max_sid]) max_sid = x;
|
||||
if (sid[max_sid] < length_ / 2)
|
||||
max_sid = unicharset_->null_sid();
|
||||
delete[] sid;
|
||||
return max_sid;
|
||||
}
|
||||
|
||||
// Fixes the state_ for a chop at the given blob_posiiton.
|
||||
void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
|
||||
int total_chunks = 0;
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
total_chunks += state_[i];
|
||||
if (total_chunks > blob_position) {
|
||||
++state_[i];
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the sum of all the state elements, being the total number of blobs.
|
||||
int WERD_CHOICE::TotalOfStates() const {
|
||||
int total_chunks = 0;
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
total_chunks += state_[i];
|
||||
}
|
||||
return total_chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -453,32 +702,87 @@ void WERD_CHOICE::delete_blob_choices() {
|
||||
*
|
||||
* Print WERD_CHOICE to stdout.
|
||||
*/
|
||||
const void WERD_CHOICE::print(const char *msg) const {
|
||||
tprintf("%s WERD_CHOICE:\n", msg);
|
||||
tprintf("length_ %d reserved_ %d permuter_ %d\n",
|
||||
length_, reserved_, permuter_);
|
||||
tprintf("rating_ %.4f certainty_ %.4f", rating_, certainty_);
|
||||
if (fragment_mark_) {
|
||||
tprintf(" fragment_mark_ true");
|
||||
void WERD_CHOICE::print(const char *msg) const {
|
||||
tprintf("%s : ", msg);
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
||||
}
|
||||
tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
|
||||
rating_, certainty_, adjust_factor_, permuter_,
|
||||
min_x_height_, max_x_height_, dangerous_ambig_found_);
|
||||
tprintf("pos");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%s", ScriptPosToString(script_pos_[i]));
|
||||
}
|
||||
tprintf("\nstr");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
||||
}
|
||||
tprintf("\nstate:");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%d ", state_[i]);
|
||||
}
|
||||
tprintf("\nC");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%.3f", certainties_[i]);
|
||||
}
|
||||
tprintf("\n");
|
||||
if (unichar_string_.length() > 0) {
|
||||
tprintf("unichar_string_ %s unichar_lengths_ %s\n",
|
||||
unichar_string_.string(), unichar_lengths_.string());
|
||||
}
|
||||
tprintf("unichar_ids: ");
|
||||
int i;
|
||||
for (i = 0; i < length_; ++i) {
|
||||
tprintf("%d ", unichar_ids_[i]);
|
||||
}
|
||||
tprintf("\nfragment_lengths_: ");
|
||||
for (i = 0; i < length_; ++i) {
|
||||
tprintf("%d ", fragment_lengths_[i]);
|
||||
}
|
||||
tprintf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// Prints the segmentation state with an introductory message.
|
||||
void WERD_CHOICE::print_state(const char *msg) const {
|
||||
tprintf("%s", msg);
|
||||
for (int i = 0; i < length_; ++i)
|
||||
tprintf(" %d", state_[i]);
|
||||
tprintf("\n");
|
||||
}
|
||||
|
||||
// Displays the segmentation state of *this (if not the same as the last
|
||||
// one displayed) and waits for a click in the window.
|
||||
void WERD_CHOICE::DisplaySegmentation(TWERD* word) {
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
// Number of different colors to draw with.
|
||||
const int kNumColors = 6;
|
||||
static ScrollView *segm_window = NULL;
|
||||
// Check the state against the static prev_drawn_state.
|
||||
static GenericVector<int> prev_drawn_state;
|
||||
bool already_done = prev_drawn_state.size() == length_;
|
||||
if (!already_done) prev_drawn_state.init_to_size(length_, 0);
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
if (prev_drawn_state[i] != state_[i]) {
|
||||
already_done = false;
|
||||
}
|
||||
prev_drawn_state[i] = state_[i];
|
||||
}
|
||||
if (already_done || word->blobs.empty()) return;
|
||||
|
||||
// Create the window if needed.
|
||||
if (segm_window == NULL) {
|
||||
segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
|
||||
2000.0, 256.0, true);
|
||||
} else {
|
||||
segm_window->Clear();
|
||||
}
|
||||
|
||||
TBOX bbox;
|
||||
int blob_index = 0;
|
||||
for (int c = 0; c < length_; ++c) {
|
||||
ScrollView::Color color =
|
||||
static_cast<ScrollView::Color>(c % kNumColors + 3);
|
||||
for (int i = 0; i < state_[c]; ++i, ++blob_index) {
|
||||
TBLOB* blob = word->blobs[blob_index];
|
||||
bbox += blob->bounding_box();
|
||||
blob->plot(segm_window, color, color);
|
||||
}
|
||||
}
|
||||
segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
|
||||
bbox.right(), bbox.bottom());
|
||||
segm_window->Update();
|
||||
window_wait(segm_window);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
|
||||
const WERD_CHOICE &word2) {
|
||||
const UNICHARSET *uchset = word1.unicharset();
|
||||
@ -526,114 +830,3 @@ void print_ratings_list(const char *msg,
|
||||
tprintf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/**
|
||||
* print_ratings_list
|
||||
*
|
||||
* Print ratings list (unichar ids only).
|
||||
*/
|
||||
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings) {
|
||||
if (ratings->length() == 0) {
|
||||
tprintf("%s:<none>\n", msg);
|
||||
return;
|
||||
}
|
||||
if (*msg != '\0') {
|
||||
tprintf("%s\n", msg);
|
||||
}
|
||||
BLOB_CHOICE_IT c_it;
|
||||
c_it.set_to_list(ratings);
|
||||
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
|
||||
c_it.data()->print(NULL);
|
||||
if (!c_it.at_last()) tprintf("\n");
|
||||
}
|
||||
tprintf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/**
|
||||
* print_ratings_info
|
||||
*
|
||||
* Send all the ratings out to the logfile.
|
||||
*
|
||||
* @param fp file to use
|
||||
* @param ratings list of results
|
||||
* @param current_unicharset unicharset that can be used
|
||||
* for id-to-unichar conversion
|
||||
*/
|
||||
void print_ratings_info(FILE *fp,
|
||||
BLOB_CHOICE_LIST *ratings,
|
||||
const UNICHARSET ¤t_unicharset) {
|
||||
inT32 index; // to list
|
||||
const char* first_char = NULL; // character
|
||||
FLOAT32 first_rat; // rating
|
||||
FLOAT32 first_cert; // certainty
|
||||
const char* sec_char = NULL; // character
|
||||
FLOAT32 sec_rat = 0.0f; // rating
|
||||
FLOAT32 sec_cert = 0.0f; // certainty
|
||||
BLOB_CHOICE_IT c_it = ratings; // iterator
|
||||
|
||||
index = ratings->length();
|
||||
if (index > 0) {
|
||||
first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
|
||||
first_rat = c_it.data()->rating();
|
||||
first_cert = -c_it.data()->certainty();
|
||||
if (index > 1) {
|
||||
sec_char = current_unicharset.id_to_unichar(
|
||||
c_it.data_relative(1)->unichar_id());
|
||||
sec_rat = c_it.data_relative(1)->rating();
|
||||
sec_cert = -c_it.data_relative(1)->certainty();
|
||||
} else {
|
||||
sec_char = NULL;
|
||||
sec_rat = -1;
|
||||
sec_cert = -1;
|
||||
}
|
||||
} else {
|
||||
first_char = NULL;
|
||||
first_rat = -1;
|
||||
first_cert = -1;
|
||||
}
|
||||
if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
|
||||
first_char = NULL;
|
||||
if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
|
||||
sec_char = NULL;
|
||||
tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n",
|
||||
ratings->length(),
|
||||
first_char != NULL ? first_char : "~",
|
||||
first_rat, first_cert, sec_char != NULL ? sec_char : "~",
|
||||
sec_rat, sec_cert);
|
||||
}
|
||||
|
||||
/**
|
||||
* print_char_choices_list
|
||||
*/
|
||||
void print_char_choices_list(const char *msg,
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
const UNICHARSET ¤t_unicharset,
|
||||
BOOL8 detailed) {
|
||||
if (*msg != '\0') tprintf("%s\n", msg);
|
||||
for (int x = 0; x < char_choices.length(); ++x) {
|
||||
BLOB_CHOICE_IT c_it;
|
||||
c_it.set_to_list(char_choices.get(x));
|
||||
tprintf("\nchar[%d]: %s\n", x,
|
||||
current_unicharset.debug_str( c_it.data()->unichar_id()).string());
|
||||
if (detailed)
|
||||
print_ratings_list("", char_choices.get(x), current_unicharset);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* print_word_alternates_list
|
||||
*/
|
||||
void print_word_alternates_list(
|
||||
WERD_CHOICE *word,
|
||||
GenericVector<WERD_CHOICE *> *alternates) {
|
||||
if (!word || !alternates) return;
|
||||
|
||||
STRING alternates_str;
|
||||
for (int i = 0; i < alternates->size(); i++) {
|
||||
if (i > 0) alternates_str += "\", \"";
|
||||
alternates_str += alternates->get(i)->unichar_string();
|
||||
}
|
||||
tprintf("Alternates for \"%s\": {\"%s\"}\n",
|
||||
word->unichar_string().string(), alternates_str.string());
|
||||
}
|
||||
|
@ -23,11 +23,27 @@
|
||||
#include <assert.h>
|
||||
|
||||
#include "clst.h"
|
||||
#include "elst.h"
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "werd.h"
|
||||
|
||||
class MATRIX;
|
||||
class TBLOB;
|
||||
class TWERD;
|
||||
|
||||
// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
|
||||
// whether a blob has been classified by inspecting the BLOB_CHOICEs.
|
||||
enum BlobChoiceClassifier {
|
||||
BCC_STATIC_CLASSIFIER, // From the char_norm classifier.
|
||||
BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.
|
||||
BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.
|
||||
BCC_AMBIG, // Generated by ambiguity detection.
|
||||
BCC_FAKE, // From some other process.
|
||||
};
|
||||
|
||||
class BLOB_CHOICE: public ELIST_LINK
|
||||
{
|
||||
public:
|
||||
@ -38,20 +54,23 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
rating_ = MAX_FLOAT32;
|
||||
certainty_ = -MAX_FLOAT32;
|
||||
script_id_ = -1;
|
||||
language_model_state_ = NULL;
|
||||
min_xheight_ = 0;
|
||||
max_xheight_ = 0;
|
||||
adapted_ = false;
|
||||
xgap_before_ = 0;
|
||||
xgap_after_ = 0;
|
||||
min_xheight_ = 0.0f;
|
||||
max_xheight_ = 0.0f;
|
||||
yshift_ = 0.0f;
|
||||
classifier_ = BCC_FAKE;
|
||||
}
|
||||
BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
|
||||
float src_rating, // rating
|
||||
float src_cert, // certainty
|
||||
inT16 src_fontinfo_id, // font
|
||||
inT16 src_fontinfo_id2, // 2nd choice font
|
||||
inT16 src_fontinfo_id, // font
|
||||
inT16 src_fontinfo_id2, // 2nd choice font
|
||||
int script_id, // script
|
||||
inT16 min_xheight, // min xheight in image pixel units
|
||||
inT16 max_xheight, // max xheight allowed by this char
|
||||
bool adapted); // adapted match or not
|
||||
float min_xheight, // min xheight in image pixel units
|
||||
float max_xheight, // max xheight allowed by this char
|
||||
float yshift, // the larger of y shift (top or bottom)
|
||||
BlobChoiceClassifier c); // adapted match or other
|
||||
BLOB_CHOICE(const BLOB_CHOICE &other);
|
||||
~BLOB_CHOICE() {}
|
||||
|
||||
@ -73,8 +92,8 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
int script_id() const {
|
||||
return script_id_;
|
||||
}
|
||||
void *language_model_state() {
|
||||
return language_model_state_;
|
||||
const MATRIX_COORD& matrix_cell() {
|
||||
return matrix_cell_;
|
||||
}
|
||||
inT16 xgap_before() const {
|
||||
return xgap_before_;
|
||||
@ -82,14 +101,25 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
inT16 xgap_after() const {
|
||||
return xgap_after_;
|
||||
}
|
||||
inT16 min_xheight() const {
|
||||
float min_xheight() const {
|
||||
return min_xheight_;
|
||||
}
|
||||
inT16 max_xheight() const {
|
||||
float max_xheight() const {
|
||||
return max_xheight_;
|
||||
}
|
||||
bool adapted() const {
|
||||
return adapted_;
|
||||
float yshift() const {
|
||||
return yshift_;
|
||||
}
|
||||
BlobChoiceClassifier classifier() const {
|
||||
return classifier_;
|
||||
}
|
||||
bool IsAdapted() const {
|
||||
return classifier_ == BCC_ADAPTED_CLASSIFIER;
|
||||
}
|
||||
bool IsClassified() const {
|
||||
return classifier_ == BCC_STATIC_CLASSIFIER ||
|
||||
classifier_ == BCC_ADAPTED_CLASSIFIER ||
|
||||
classifier_ == BCC_SPECKLE_CLASSIFIER;
|
||||
}
|
||||
|
||||
void set_unichar_id(UNICHAR_ID newunichar_id) {
|
||||
@ -110,8 +140,9 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
void set_script(int newscript_id) {
|
||||
script_id_ = newscript_id;
|
||||
}
|
||||
void set_language_model_state(void *language_model_state) {
|
||||
language_model_state_ = language_model_state;
|
||||
void set_matrix_cell(int col, int row) {
|
||||
matrix_cell_.col = col;
|
||||
matrix_cell_.row = row;
|
||||
}
|
||||
void set_xgap_before(inT16 gap) {
|
||||
xgap_before_ = gap;
|
||||
@ -119,19 +150,39 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
void set_xgap_after(inT16 gap) {
|
||||
xgap_after_ = gap;
|
||||
}
|
||||
void set_adapted(bool adapted) {
|
||||
adapted_ = adapted;
|
||||
void set_classifier(BlobChoiceClassifier classifier) {
|
||||
classifier_ = classifier;
|
||||
}
|
||||
static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
|
||||
BLOB_CHOICE* choice = new BLOB_CHOICE;
|
||||
*choice = *src;
|
||||
return choice;
|
||||
}
|
||||
void print(const UNICHARSET *unicharset) {
|
||||
tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
|
||||
// Returns true if *this and other agree on the baseline and x-height
|
||||
// to within some tolerance based on a given estimate of the x-height.
|
||||
bool PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
|
||||
bool debug) const;
|
||||
|
||||
void print(const UNICHARSET *unicharset) const {
|
||||
tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
|
||||
rating_, certainty_,
|
||||
min_xheight_, max_xheight_, unichar_id_,
|
||||
(unicharset == NULL) ? "" :
|
||||
unicharset->debug_str(unichar_id_).string());
|
||||
}
|
||||
void print_full() const {
|
||||
print(NULL);
|
||||
tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n",
|
||||
script_id_, fontinfo_id_, fontinfo_id2_, yshift_, classifier_);
|
||||
}
|
||||
// Sort function for sorting BLOB_CHOICEs in increasing order of rating.
|
||||
static int SortByRating(const void *p1, const void *p2) {
|
||||
const BLOB_CHOICE *bc1 =
|
||||
*reinterpret_cast<const BLOB_CHOICE * const *>(p1);
|
||||
const BLOB_CHOICE *bc2 =
|
||||
*reinterpret_cast<const BLOB_CHOICE * const *>(p2);
|
||||
return (bc1->rating_ < bc2->rating_) ? -1 : 1;
|
||||
}
|
||||
|
||||
private:
|
||||
UNICHAR_ID unichar_id_; // unichar id
|
||||
@ -149,21 +200,26 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
// k is defined as above to normalize -klog p to the range [0, 1].
|
||||
float certainty_; // absolute
|
||||
int script_id_;
|
||||
// Stores language model information about this BLOB_CHOICE. Used during
|
||||
// the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
|
||||
// recorded in the ratings matrix.
|
||||
// The pointer is owned/managed by the segmentation search.
|
||||
void *language_model_state_;
|
||||
// Holds the position of this choice in the ratings matrix.
|
||||
// Used to location position in the matrix during path backtracking.
|
||||
MATRIX_COORD matrix_cell_;
|
||||
inT16 xgap_before_;
|
||||
inT16 xgap_after_;
|
||||
// X-height range (in image pixels) that this classification supports.
|
||||
inT16 min_xheight_;
|
||||
inT16 max_xheight_;
|
||||
bool adapted_; // true if this is a match from adapted templates
|
||||
float min_xheight_;
|
||||
float max_xheight_;
|
||||
// yshift_ - The vertical distance (in image pixels) the character is
|
||||
// shifted (up or down) from an acceptable y position.
|
||||
float yshift_;
|
||||
BlobChoiceClassifier classifier_; // What generated *this.
|
||||
};
|
||||
|
||||
// Make BLOB_CHOICE listable.
|
||||
ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
|
||||
ELISTIZEH(BLOB_CHOICE)
|
||||
|
||||
// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
|
||||
// or NULL if there is no match.
|
||||
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
|
||||
|
||||
// Permuter codes used in WERD_CHOICEs.
|
||||
enum PermuterType {
|
||||
@ -180,11 +236,27 @@ enum PermuterType {
|
||||
USER_DAWG_PERM, // 10
|
||||
FREQ_DAWG_PERM, // 11
|
||||
COMPOUND_PERM, // 12
|
||||
|
||||
NUM_PERMUTER_TYPES
|
||||
};
|
||||
|
||||
class WERD_CHOICE {
|
||||
namespace tesseract {
|
||||
// ScriptPos tells whether a character is subscript, superscript or normal.
|
||||
enum ScriptPos {
|
||||
SP_NORMAL,
|
||||
SP_SUBSCRIPT,
|
||||
SP_SUPERSCRIPT,
|
||||
SP_DROPCAP
|
||||
};
|
||||
|
||||
const char *ScriptPosToString(tesseract::ScriptPos script_pos);
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
class WERD_CHOICE : public ELIST_LINK {
|
||||
public:
|
||||
static const float kBadRating;
|
||||
static const char *permuter_name(uinT8 permuter);
|
||||
|
||||
WERD_CHOICE(const UNICHARSET *unicharset)
|
||||
: unicharset_(unicharset) { this->init(8); }
|
||||
@ -213,6 +285,12 @@ class WERD_CHOICE {
|
||||
inline int length() const {
|
||||
return length_;
|
||||
}
|
||||
float adjust_factor() const {
|
||||
return adjust_factor_;
|
||||
}
|
||||
void set_adjust_factor(float factor) {
|
||||
adjust_factor_ = factor;
|
||||
}
|
||||
inline const UNICHAR_ID *unichar_ids() const {
|
||||
return unichar_ids_;
|
||||
}
|
||||
@ -220,12 +298,13 @@ class WERD_CHOICE {
|
||||
assert(index < length_);
|
||||
return unichar_ids_[index];
|
||||
}
|
||||
inline const char *fragment_lengths() const {
|
||||
return fragment_lengths_;
|
||||
inline int state(int index) const {
|
||||
return state_[index];
|
||||
}
|
||||
inline const char fragment_length(int index) const {
|
||||
assert(index < length_);
|
||||
return fragment_lengths_[index];
|
||||
tesseract::ScriptPos BlobPosition(int index) const {
|
||||
if (index < 0 || index >= length_)
|
||||
return tesseract::SP_NORMAL;
|
||||
return script_pos_[index];
|
||||
}
|
||||
inline float rating() const {
|
||||
return rating_;
|
||||
@ -233,23 +312,41 @@ class WERD_CHOICE {
|
||||
inline float certainty() const {
|
||||
return certainty_;
|
||||
}
|
||||
inline float certainty(int index) const {
|
||||
return certainties_[index];
|
||||
}
|
||||
inline float min_x_height() const {
|
||||
return min_x_height_;
|
||||
}
|
||||
inline float max_x_height() const {
|
||||
return max_x_height_;
|
||||
}
|
||||
inline void set_x_heights(float min_height, float max_height) {
|
||||
min_x_height_ = min_height;
|
||||
max_x_height_ = max_height;
|
||||
}
|
||||
inline uinT8 permuter() const {
|
||||
return permuter_;
|
||||
}
|
||||
const char *permuter_name() const;
|
||||
inline bool fragment_mark() const {
|
||||
return fragment_mark_;
|
||||
}
|
||||
inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
|
||||
return blob_choices_;
|
||||
}
|
||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
||||
// taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete.
|
||||
BLOB_CHOICE_LIST* blob_choices(int index, MATRIX* ratings) const;
|
||||
|
||||
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
||||
// MATRIX for the given index into the word.
|
||||
MATRIX_COORD MatrixCoord(int index) const;
|
||||
|
||||
inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
|
||||
assert(index < length_);
|
||||
unichar_ids_[index] = unichar_id;
|
||||
}
|
||||
inline void set_fragment_length(char flen, int index) {
|
||||
assert(index < length_);
|
||||
fragment_lengths_[index] = flen;
|
||||
bool dangerous_ambig_found() const {
|
||||
return dangerous_ambig_found_;
|
||||
}
|
||||
void set_dangerous_ambig_found_(bool value) {
|
||||
dangerous_ambig_found_ = value;
|
||||
}
|
||||
inline void set_rating(float new_val) {
|
||||
rating_ = new_val;
|
||||
@ -260,9 +357,6 @@ class WERD_CHOICE {
|
||||
inline void set_permuter(uinT8 perm) {
|
||||
permuter_ = perm;
|
||||
}
|
||||
inline void set_fragment_mark(bool new_fragment_mark) {
|
||||
fragment_mark_ = new_fragment_mark;
|
||||
}
|
||||
// Note: this function should only be used if all the fields
|
||||
// are populated manually with set_* functions (rather than
|
||||
// (copy)constructors and append_* functions).
|
||||
@ -270,19 +364,24 @@ class WERD_CHOICE {
|
||||
ASSERT_HOST(reserved_ >= len);
|
||||
length_ = len;
|
||||
}
|
||||
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
|
||||
/// Make more space in unichar_id_ and fragment_lengths_ arrays.
|
||||
inline void double_the_size() {
|
||||
if (reserved_ > 0) {
|
||||
unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
|
||||
reserved_, unichar_ids_);
|
||||
fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
|
||||
reserved_, fragment_lengths_);
|
||||
script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
|
||||
reserved_, script_pos_);
|
||||
state_ = GenericVector<int>::double_the_size_memcpy(
|
||||
reserved_, state_);
|
||||
certainties_ = GenericVector<float>::double_the_size_memcpy(
|
||||
reserved_, certainties_);
|
||||
reserved_ *= 2;
|
||||
} else {
|
||||
unichar_ids_ = new UNICHAR_ID[1];
|
||||
fragment_lengths_ = new char[1];
|
||||
script_pos_ = new tesseract::ScriptPos[1];
|
||||
state_ = new int[1];
|
||||
certainties_ = new float[1];
|
||||
reserved_ = 1;
|
||||
}
|
||||
}
|
||||
@ -293,18 +392,24 @@ class WERD_CHOICE {
|
||||
reserved_ = reserved;
|
||||
if (reserved > 0) {
|
||||
unichar_ids_ = new UNICHAR_ID[reserved];
|
||||
fragment_lengths_ = new char[reserved];
|
||||
script_pos_ = new tesseract::ScriptPos[reserved];
|
||||
state_ = new int[reserved];
|
||||
certainties_ = new float[reserved];
|
||||
} else {
|
||||
unichar_ids_ = NULL;
|
||||
fragment_lengths_ = NULL;
|
||||
script_pos_ = NULL;
|
||||
state_ = NULL;
|
||||
certainties_ = NULL;
|
||||
}
|
||||
length_ = 0;
|
||||
adjust_factor_ = 1.0f;
|
||||
rating_ = 0.0;
|
||||
certainty_ = MAX_FLOAT32;
|
||||
min_x_height_ = 0.0f;
|
||||
max_x_height_ = MAX_FLOAT32;
|
||||
permuter_ = NO_PERM;
|
||||
fragment_mark_ = false;
|
||||
blob_choices_ = NULL;
|
||||
unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
|
||||
dangerous_ambig_found_ = false;
|
||||
}
|
||||
|
||||
/// Helper function to build a WERD_CHOICE from the given string,
|
||||
@ -321,34 +426,39 @@ class WERD_CHOICE {
|
||||
length_ = 0;
|
||||
rating_ = kBadRating;
|
||||
certainty_ = -MAX_FLOAT32;
|
||||
fragment_mark_ = false;
|
||||
}
|
||||
|
||||
/// This function assumes that there is enough space reserved
|
||||
/// in the WERD_CHOICE for adding another unichar.
|
||||
/// This is an efficient alternative to append_unichar_id().
|
||||
inline void append_unichar_id_space_allocated(
|
||||
UNICHAR_ID unichar_id, char fragment_length,
|
||||
UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty) {
|
||||
assert(reserved_ > length_);
|
||||
length_++;
|
||||
this->set_unichar_id(unichar_id, fragment_length,
|
||||
this->set_unichar_id(unichar_id, blob_count,
|
||||
rating, certainty, length_-1);
|
||||
}
|
||||
|
||||
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
|
||||
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty);
|
||||
|
||||
inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
|
||||
inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty, int index) {
|
||||
assert(index < length_);
|
||||
unichar_ids_[index] = unichar_id;
|
||||
fragment_lengths_[index] = fragment_length;
|
||||
state_[index] = blob_count;
|
||||
certainties_[index] = certainty;
|
||||
script_pos_[index] = tesseract::SP_NORMAL;
|
||||
rating_ += rating;
|
||||
if (certainty < certainty_) {
|
||||
certainty_ = certainty;
|
||||
}
|
||||
}
|
||||
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
||||
// unit fragment lengths, but setting the state for this index to blob_count.
|
||||
void set_blob_choice(int index, int blob_count,
|
||||
const BLOB_CHOICE* blob_choice);
|
||||
|
||||
bool contains_unichar_id(UNICHAR_ID unichar_id) const;
|
||||
void remove_unichar_ids(int index, int num);
|
||||
@ -364,6 +474,11 @@ class WERD_CHOICE {
|
||||
// punctuation from the left and right.
|
||||
void punct_stripped(int *start_core, int *end_core) const;
|
||||
|
||||
// Returns the indices [start, end) containing the core of the word, stripped
|
||||
// of any superscript digits on either side. (i.e., the non-footnote part
|
||||
// of the word). There is no guarantee that the output range is non-empty.
|
||||
void GetNonSuperscriptSpan(int *start, int *end) const;
|
||||
|
||||
// Return a copy of this WERD_CHOICE with the choices [start, end).
|
||||
// The result is useful only for checking against a dictionary.
|
||||
WERD_CHOICE shallow_copy(int start, int end) const;
|
||||
@ -402,8 +517,42 @@ class WERD_CHOICE {
|
||||
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
||||
return unichar_lengths_;
|
||||
}
|
||||
const void print() const { this->print(""); }
|
||||
const void print(const char *msg) const;
|
||||
|
||||
// Sets up the script_pos_ member using the blobs_list to get the bln
|
||||
// bounding boxes, *this to get the unichars, and this->unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
||||
void SetScriptPositions(bool small_caps, TWERD* word);
|
||||
// Sets the script_pos_ member from some source positions with a given length.
|
||||
void SetScriptPositions(const tesseract::ScriptPos* positions, int length);
|
||||
// Sets all the script_pos_ positions to the given position.
|
||||
void SetAllScriptPositions(tesseract::ScriptPos position);
|
||||
|
||||
static tesseract::ScriptPos ScriptPositionOf(bool print_debug,
|
||||
const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
UNICHAR_ID unichar_id);
|
||||
|
||||
// Returns the "dominant" script ID for the word. By "dominant", the script
|
||||
// must account for at least half the characters. Otherwise, it returns 0.
|
||||
// Note that for Japanese, Hiragana and Katakana are simply treated as Han.
|
||||
int GetTopScriptID() const;
|
||||
|
||||
// Fixes the state_ for a chop at the given blob_posiiton.
|
||||
void UpdateStateForSplit(int blob_position);
|
||||
|
||||
// Returns the sum of all the state elements, being the total number of blobs.
|
||||
int TotalOfStates() const;
|
||||
|
||||
void print() const { this->print(""); }
|
||||
void print(const char *msg) const;
|
||||
// Prints the segmentation state with an introductory message.
|
||||
void print_state(const char *msg) const;
|
||||
|
||||
// Displays the segmentation state of *this (if not the same as the last
|
||||
// one displayed) and waits for a click in the window.
|
||||
void DisplaySegmentation(TWERD* word);
|
||||
|
||||
WERD_CHOICE& operator+= ( // concatanate
|
||||
const WERD_CHOICE & second);// second on first
|
||||
@ -412,41 +561,55 @@ class WERD_CHOICE {
|
||||
|
||||
private:
|
||||
const UNICHARSET *unicharset_;
|
||||
// TODO(rays) Perhaps replace the multiple arrays with an array of structs?
|
||||
// unichar_ids_ is an array of classifier "results" that make up a word.
|
||||
// For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
|
||||
// of each unichar_id.
|
||||
// state_[i] indicates the number of blobs in WERD_RES::chopped_word that
|
||||
// were put together to make the classification results in the ith position
|
||||
// in unichar_ids_, and certainties_[i] is the certainty of the choice that
|
||||
// was used in this word.
|
||||
// == Change from before ==
|
||||
// Previously there was fragment_lengths_ that allowed a word to be
|
||||
// artificially composed of multiple fragment results. Since the new
|
||||
// segmentation search doesn't do fragments, treatment of fragments has
|
||||
// been moved to a lower level, augmenting the ratings matrix with the
|
||||
// combined fragments, and allowing the language-model/segmentation-search
|
||||
// to deal with only the combined unichar_ids.
|
||||
UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word
|
||||
char *fragment_lengths_; // number of fragments in each unichar
|
||||
tesseract::ScriptPos* script_pos_; // Normal/Sub/Superscript of each unichar.
|
||||
int* state_; // Number of blobs in each unichar.
|
||||
float* certainties_; // Certainty of each unichar.
|
||||
int reserved_; // size of the above arrays
|
||||
int length_; // word length
|
||||
// Factor that was used to adjust the rating.
|
||||
float adjust_factor_;
|
||||
// Rating is the sum of the ratings of the individual blobs in the word.
|
||||
float rating_; // size related
|
||||
// certainty is the min (worst) certainty of the individual blobs in the word.
|
||||
float certainty_; // absolute
|
||||
// xheight computed from the result, or 0 if inconsistent.
|
||||
float min_x_height_;
|
||||
float max_x_height_;
|
||||
uinT8 permuter_; // permuter code
|
||||
bool fragment_mark_; // if true, indicates that this choice
|
||||
// was chosen over a better one that
|
||||
// contained a fragment
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices_; // best choices for each blob
|
||||
|
||||
// Normally, the blob_choices_ represent the recognition results in order
|
||||
// Normally, the ratings_ matrix represents the recognition results in order
|
||||
// from left-to-right. However, some engines (say Cube) may return
|
||||
// recognition results in the order of the script's major reading direction
|
||||
// (for Arabic, that is right-to-left).
|
||||
bool unichars_in_script_order_;
|
||||
// True if NoDangerousAmbig found an ambiguity.
|
||||
bool dangerous_ambig_found_;
|
||||
|
||||
// The following variables are populated and passed by reference any
|
||||
// time unichar_string() or unichar_lengths() are called.
|
||||
mutable STRING unichar_string_;
|
||||
mutable STRING unichar_lengths_;
|
||||
|
||||
bool unichar_info_present;
|
||||
|
||||
private:
|
||||
void delete_blob_choices();
|
||||
};
|
||||
|
||||
// Make WERD_CHOICE listable.
|
||||
ELISTIZEH (WERD_CHOICE)
|
||||
ELISTIZEH(WERD_CHOICE)
|
||||
typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
|
||||
typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
|
||||
|
||||
// Utilities for comparing WERD_CHOICEs
|
||||
|
||||
@ -454,27 +617,11 @@ bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
|
||||
const WERD_CHOICE &word2);
|
||||
|
||||
// Utilities for debug printing.
|
||||
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
|
||||
void print_ratings_list(
|
||||
const char *msg, // intro message
|
||||
BLOB_CHOICE_LIST *ratings, // list of results
|
||||
const UNICHARSET ¤t_unicharset // unicharset that can be used
|
||||
// for id-to-unichar conversion
|
||||
);
|
||||
void print_ratings_info(
|
||||
FILE *fp, // file to use
|
||||
BLOB_CHOICE_LIST *ratings, // list of results
|
||||
const UNICHARSET ¤t_unicharset // unicharset that can be used
|
||||
// for id-to-unichar conversion
|
||||
);
|
||||
void print_char_choices_list(
|
||||
const char *msg,
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
const UNICHARSET ¤t_unicharset,
|
||||
BOOL8 detailed
|
||||
);
|
||||
void print_word_alternates_list(
|
||||
WERD_CHOICE *word,
|
||||
GenericVector<WERD_CHOICE *> *alternates);
|
||||
|
||||
#endif
|
||||
|
@ -171,6 +171,16 @@ void TBOX::plot( //paint box
|
||||
}
|
||||
#endif
|
||||
|
||||
// Appends the bounding box as (%d,%d)->(%d,%d) to a STRING.
|
||||
void TBOX::print_to_str(STRING *str) const {
|
||||
// "(%d,%d)->(%d,%d)", left(), bottom(), right(), top()
|
||||
str->add_str_int("(", left());
|
||||
str->add_str_int(",", bottom());
|
||||
str->add_str_int(")->(", right());
|
||||
str->add_str_int(",", top());
|
||||
*str += ')';
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool TBOX::Serialize(FILE* fp) const {
|
||||
if (!bot_left.Serialize(fp)) return false;
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "points.h"
|
||||
#include "ndminx.h"
|
||||
#include "scrollview.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
class DLLSYM TBOX { // bounding box
|
||||
@ -264,15 +265,8 @@ class DLLSYM TBOX { // bounding box
|
||||
tprintf("Bounding box=(%d,%d)->(%d,%d)\n",
|
||||
left(), bottom(), right(), top());
|
||||
}
|
||||
|
||||
// Same as print(), but appends debug information to the given string
|
||||
// instead of printing it to stdout.
|
||||
void append_debug(STRING *str) const {
|
||||
char buffer[256];
|
||||
sprintf(buffer, "Bounding box=(%d,%d)->(%d,%d)\n",
|
||||
left(), bottom(), right(), top());
|
||||
*str += buffer;
|
||||
}
|
||||
// Appends the bounding box as (%d,%d)->(%d,%d) to a STRING.
|
||||
void print_to_str(STRING *str) const;
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot( // use current settings
|
||||
|
@ -27,8 +27,8 @@
|
||||
----------------------------------------------------------------------*/
|
||||
#include "seam.h"
|
||||
#include "blobs.h"
|
||||
#include "callcpp.h"
|
||||
#include "structures.h"
|
||||
#include "freelist.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#ifdef __UNIX__
|
||||
#include <assert.h>
|
||||
@ -38,7 +38,6 @@
|
||||
V a r i a b l e s
|
||||
----------------------------------------------------------------------*/
|
||||
#define NUM_STARTING_SEAMS 20
|
||||
makestructure(newseam, free_seam, SEAM);
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
Public Function Code
|
||||
@ -66,7 +65,7 @@ bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2) {
|
||||
* seam.
|
||||
* @returns TRUE if one of them is.
|
||||
*/
|
||||
bool point_in_seam(SEAM *seam, SPLIT *split) {
|
||||
bool point_in_seam(const SEAM *seam, SPLIT *split) {
|
||||
return (point_in_split(seam->split1, split->point1, split->point2) ||
|
||||
point_in_split(seam->split2, split->point1, split->point2) ||
|
||||
point_in_split(seam->split3, split->point1, split->point2));
|
||||
@ -96,16 +95,6 @@ bool point_used_by_seam(SEAM *seam, EDGEPT *point) {
|
||||
point_used_by_split(seam->split3, point);
|
||||
}
|
||||
|
||||
/**
|
||||
* @name add_seam
|
||||
*
|
||||
* Add another seam to a collection of seams.
|
||||
*/
|
||||
SEAMS add_seam(SEAMS seam_list, SEAM *seam) {
|
||||
return (array_push (seam_list, seam));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name combine_seam
|
||||
*
|
||||
@ -126,7 +115,8 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
|
||||
else if (!dest_seam->split3)
|
||||
dest_seam->split3 = source_seam->split1;
|
||||
else
|
||||
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
|
||||
delete source_seam->split1; // Wouldn't have fitted.
|
||||
source_seam->split1 = NULL;
|
||||
}
|
||||
if (source_seam->split2) {
|
||||
if (!dest_seam->split2)
|
||||
@ -134,35 +124,17 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
|
||||
else if (!dest_seam->split3)
|
||||
dest_seam->split3 = source_seam->split2;
|
||||
else
|
||||
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
|
||||
delete source_seam->split2; // Wouldn't have fitted.
|
||||
source_seam->split2 = NULL;
|
||||
}
|
||||
if (source_seam->split3) {
|
||||
if (!dest_seam->split3)
|
||||
dest_seam->split3 = source_seam->split3;
|
||||
else
|
||||
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
|
||||
}
|
||||
free_seam(source_seam);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name delete_seam
|
||||
*
|
||||
* Free this seam record and the splits that are attached to it.
|
||||
*/
|
||||
void delete_seam(void *arg) { //SEAM *seam)
|
||||
SEAM *seam = (SEAM *) arg;
|
||||
|
||||
if (seam) {
|
||||
if (seam->split1)
|
||||
delete_split(seam->split1);
|
||||
if (seam->split2)
|
||||
delete_split(seam->split2);
|
||||
if (seam->split3)
|
||||
delete_split(seam->split3);
|
||||
free_seam(seam);
|
||||
delete source_seam->split3; // Wouldn't have fitted.
|
||||
source_seam->split3 = NULL;
|
||||
}
|
||||
delete source_seam;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -172,36 +144,17 @@ void delete_seam(void *arg) { //SEAM *seam)
|
||||
* present in the starting segmentation. Each of the seams created
|
||||
* by this routine have location information only.
|
||||
*/
|
||||
SEAMS start_seam_list(TBLOB *blobs) {
|
||||
TBLOB *blob;
|
||||
SEAMS seam_list;
|
||||
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array) {
|
||||
seam_array->truncate(0);
|
||||
TPOINT location;
|
||||
/* Seam slot per char */
|
||||
seam_list = new_seam_list ();
|
||||
|
||||
for (blob = blobs; blob->next != NULL; blob = blob->next) {
|
||||
TBOX bbox = blob->bounding_box();
|
||||
TBOX nbox = blob->next->bounding_box();
|
||||
for (int b = 1; b < word->NumBlobs(); ++b) {
|
||||
TBOX bbox = word->blobs[b - 1]->bounding_box();
|
||||
TBOX nbox = word->blobs[b]->bounding_box();
|
||||
location.x = (bbox.right() + nbox.left()) / 2;
|
||||
location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4;
|
||||
seam_list = add_seam(seam_list,
|
||||
new_seam(0.0, location, NULL, NULL, NULL));
|
||||
seam_array->push_back(new SEAM(0.0f, location, NULL, NULL, NULL));
|
||||
}
|
||||
|
||||
return seam_list;
|
||||
}
|
||||
|
||||
/**
|
||||
* @name free_seam_list
|
||||
*
|
||||
* Free all the seams that have been allocated in this list. Reclaim
|
||||
* the memory for each of the splits as well.
|
||||
*/
|
||||
void free_seam_list(SEAMS seam_list) {
|
||||
int x;
|
||||
|
||||
array_loop(seam_list, x) delete_seam(array_value (seam_list, x));
|
||||
array_free(seam_list);
|
||||
}
|
||||
|
||||
|
||||
@ -210,32 +163,26 @@ void free_seam_list(SEAMS seam_list) {
|
||||
*
|
||||
* @returns true if insert_seam will succeed.
|
||||
*/
|
||||
bool test_insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob) {
|
||||
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
|
||||
TWERD *word, int index) {
|
||||
SEAM *test_seam;
|
||||
TBLOB *blob;
|
||||
int test_index;
|
||||
int list_length;
|
||||
|
||||
list_length = array_count (seam_list);
|
||||
for (test_index=0, blob=first_blob->next;
|
||||
test_index < index;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
list_length = seam_array.size();
|
||||
for (int test_index = 0; test_index < index; ++test_index) {
|
||||
test_seam = seam_array[test_index];
|
||||
if (test_index + test_seam->widthp < index &&
|
||||
test_seam->widthp + test_index == index - 1 &&
|
||||
account_splits_right(test_seam, blob) < 0)
|
||||
account_splits(test_seam, word, test_index + 1, 1) < 0)
|
||||
return false;
|
||||
}
|
||||
for (test_index=index, blob=left_blob->next;
|
||||
test_index < list_length;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
for (int test_index = index; test_index < list_length; test_index++) {
|
||||
test_seam = seam_array[test_index];
|
||||
if (test_index - test_seam->widthn >= index &&
|
||||
test_index - test_seam->widthn == index &&
|
||||
account_splits_left(test_seam, first_blob, blob) < 0)
|
||||
account_splits(test_seam, word, test_index + 1, -1) < 0)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -247,58 +194,51 @@ bool test_insert_seam(SEAMS seam_list,
|
||||
* Add another seam to a collection of seams at a particular location
|
||||
* in the seam array.
|
||||
*/
|
||||
SEAMS insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
SEAM *seam,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob) {
|
||||
void insert_seam(const TWERD* word, int index, SEAM *seam,
|
||||
GenericVector<SEAM*>* seam_array) {
|
||||
SEAM *test_seam;
|
||||
TBLOB *blob;
|
||||
int test_index;
|
||||
int list_length;
|
||||
|
||||
list_length = array_count(seam_list);
|
||||
for (test_index=0, blob=first_blob->next;
|
||||
test_index < index;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
list_length = seam_array->size();
|
||||
for (int test_index = 0; test_index < index; ++test_index) {
|
||||
test_seam = seam_array->get(test_index);
|
||||
if (test_index + test_seam->widthp >= index) {
|
||||
test_seam->widthp++; /*got in the way */
|
||||
} else if (test_seam->widthp + test_index == index - 1) {
|
||||
test_seam->widthp = account_splits_right(test_seam, blob);
|
||||
test_seam->widthp = account_splits(test_seam, word, test_index + 1, 1);
|
||||
if (test_seam->widthp < 0) {
|
||||
cprintf("Failed to find any right blob for a split!\n");
|
||||
tprintf("Failed to find any right blob for a split!\n");
|
||||
print_seam("New dud seam", seam);
|
||||
print_seam("Failed seam", test_seam);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (test_index=index, blob=left_blob->next;
|
||||
test_index < list_length;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
for (int test_index = index; test_index < list_length; test_index++) {
|
||||
test_seam = seam_array->get(test_index);
|
||||
if (test_index - test_seam->widthn < index) {
|
||||
test_seam->widthn++; /*got in the way */
|
||||
} else if (test_index - test_seam->widthn == index) {
|
||||
test_seam->widthn = account_splits_left(test_seam, first_blob, blob);
|
||||
test_seam->widthn = account_splits(test_seam, word, test_index + 1, -1);
|
||||
if (test_seam->widthn < 0) {
|
||||
cprintf("Failed to find any left blob for a split!\n");
|
||||
tprintf("Failed to find any left blob for a split!\n");
|
||||
print_seam("New dud seam", seam);
|
||||
print_seam("Failed seam", test_seam);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (array_insert (seam_list, index, seam));
|
||||
seam_array->insert(seam, index);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name account_splits_right
|
||||
* @name account_splits
|
||||
*
|
||||
* Account for all the splits by looking to the right.
|
||||
* in the blob list.
|
||||
* Account for all the splits by looking to the right (blob_direction == 1),
|
||||
* or to the left (blob_direction == -1) in the word.
|
||||
*/
|
||||
int account_splits_right(SEAM *seam, TBLOB *blob) {
|
||||
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
|
||||
int blob_direction) {
|
||||
inT8 found_em[3];
|
||||
inT8 width;
|
||||
|
||||
@ -309,6 +249,7 @@ int account_splits_right(SEAM *seam, TBLOB *blob) {
|
||||
return 0;
|
||||
width = 0;
|
||||
do {
|
||||
TBLOB* blob = word->blobs[blob_index];
|
||||
if (!found_em[0])
|
||||
found_em[0] = find_split_in_blob(seam->split1, blob);
|
||||
if (!found_em[1])
|
||||
@ -319,54 +260,12 @@ int account_splits_right(SEAM *seam, TBLOB *blob) {
|
||||
return width;
|
||||
}
|
||||
width++;
|
||||
blob = blob->next;
|
||||
} while (blob != NULL);
|
||||
blob_index += blob_direction;
|
||||
} while (0 <= blob_index && blob_index < word->NumBlobs());
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name account_splits_left
|
||||
*
|
||||
* Account for all the splits by looking to the left.
|
||||
* in the blob list.
|
||||
*/
|
||||
int account_splits_left(SEAM *seam, TBLOB *blob, TBLOB *end_blob) {
|
||||
inT32 depth = 0;
|
||||
inT8 width = 0;
|
||||
inT8 found_em[3];
|
||||
account_splits_left_helper(seam, blob, end_blob, &depth, &width, found_em);
|
||||
return width;
|
||||
}
|
||||
|
||||
void account_splits_left_helper(SEAM *seam, TBLOB *blob, TBLOB *end_blob,
|
||||
inT32 *depth, inT8 *width, inT8* found_em) {
|
||||
if (blob != end_blob) {
|
||||
(*depth)++;
|
||||
account_splits_left_helper(seam, blob->next, end_blob,
|
||||
depth, width, found_em);
|
||||
(*depth)--;
|
||||
} else {
|
||||
found_em[0] = seam->split1 == NULL;
|
||||
found_em[1] = seam->split2 == NULL;
|
||||
found_em[2] = seam->split3 == NULL;
|
||||
*width = 0;
|
||||
}
|
||||
if (!found_em[0])
|
||||
found_em[0] = find_split_in_blob(seam->split1, blob);
|
||||
if (!found_em[1])
|
||||
found_em[1] = find_split_in_blob(seam->split2, blob);
|
||||
if (!found_em[2])
|
||||
found_em[2] = find_split_in_blob(seam->split3, blob);
|
||||
if (!found_em[0] || !found_em[1] || !found_em[2]) {
|
||||
(*width)++;
|
||||
if (*depth == 0) {
|
||||
*width = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name find_split_in_blob
|
||||
*
|
||||
@ -393,7 +292,7 @@ bool find_split_in_blob(SPLIT *split, TBLOB *blob) {
|
||||
* Merge these two seams into a new seam. Duplicate the split records
|
||||
* in both of the input seams. Return the resultant seam.
|
||||
*/
|
||||
SEAM *join_two_seams(SEAM *seam1, SEAM *seam2) {
|
||||
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2) {
|
||||
SEAM *result = NULL;
|
||||
SEAM *temp;
|
||||
|
||||
@ -403,52 +302,13 @@ SEAM *join_two_seams(SEAM *seam1, SEAM *seam2) {
|
||||
(seam1->split2 == NULL && seam2->split3 == NULL) ||
|
||||
seam1->split1 == NULL || seam2->split1 == NULL) &&
|
||||
(!shared_split_points(seam1, seam2))) {
|
||||
clone_seam(result, seam1);
|
||||
clone_seam(temp, seam2);
|
||||
result = new SEAM(*seam1);
|
||||
temp = new SEAM(*seam2);
|
||||
combine_seams(result, temp);
|
||||
}
|
||||
return (result);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name new_seam
|
||||
*
|
||||
* Create a structure for a "seam" between two blobs. This data
|
||||
* structure may actually hold up to three different splits.
|
||||
* Initailization of this record is done by this routine.
|
||||
*/
|
||||
SEAM *new_seam(PRIORITY priority,
|
||||
const TPOINT& location,
|
||||
SPLIT *split1,
|
||||
SPLIT *split2,
|
||||
SPLIT *split3) {
|
||||
SEAM *seam;
|
||||
|
||||
seam = newseam ();
|
||||
|
||||
seam->priority = priority;
|
||||
seam->location = location;
|
||||
seam->widthp = 0;
|
||||
seam->widthn = 0;
|
||||
seam->split1 = split1;
|
||||
seam->split2 = split2;
|
||||
seam->split3 = split3;
|
||||
|
||||
return (seam);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name new_seam_list
|
||||
*
|
||||
* Create a collection of seam records in an array.
|
||||
*/
|
||||
SEAMS new_seam_list() {
|
||||
return (array_new (NUM_STARTING_SEAMS));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name print_seam
|
||||
*
|
||||
@ -457,21 +317,21 @@ SEAMS new_seam_list() {
|
||||
*/
|
||||
void print_seam(const char *label, SEAM *seam) {
|
||||
if (seam) {
|
||||
cprintf(label);
|
||||
cprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
|
||||
tprintf(label);
|
||||
tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
|
||||
seam->priority, seam->location.x, seam->location.y,
|
||||
seam->widthp, seam->widthn);
|
||||
print_split(seam->split1);
|
||||
|
||||
if (seam->split2) {
|
||||
cprintf(", ");
|
||||
tprintf(", ");
|
||||
print_split (seam->split2);
|
||||
if (seam->split3) {
|
||||
cprintf(", ");
|
||||
tprintf(", ");
|
||||
print_split (seam->split3);
|
||||
}
|
||||
}
|
||||
cprintf ("\n");
|
||||
tprintf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -482,17 +342,16 @@ void print_seam(const char *label, SEAM *seam) {
|
||||
* Print a list of splits. Show the coordinates of both points in
|
||||
* each split.
|
||||
*/
|
||||
void print_seams(const char *label, SEAMS seams) {
|
||||
int x;
|
||||
void print_seams(const char *label, const GenericVector<SEAM*>& seams) {
|
||||
char number[CHARS_PER_LINE];
|
||||
|
||||
if (seams) {
|
||||
cprintf("%s\n", label);
|
||||
array_loop(seams, x) {
|
||||
if (!seams.empty()) {
|
||||
tprintf("%s\n", label);
|
||||
for (int x = 0; x < seams.size(); ++x) {
|
||||
sprintf(number, "%2d: ", x);
|
||||
print_seam(number, (SEAM *) array_value(seams, x));
|
||||
print_seam(number, seams[x]);
|
||||
}
|
||||
cprintf("\n");
|
||||
tprintf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -504,7 +363,7 @@ void print_seams(const char *label, SEAMS seams) {
|
||||
* points in common. Return TRUE if any of the same points are present
|
||||
* in any of the splits of both seams.
|
||||
*/
|
||||
int shared_split_points(SEAM *seam1, SEAM *seam2) {
|
||||
int shared_split_points(const SEAM *seam1, const SEAM *seam2) {
|
||||
if (seam1 == NULL || seam2 == NULL)
|
||||
return (FALSE);
|
||||
|
||||
@ -532,23 +391,20 @@ int shared_split_points(SEAM *seam1, SEAM *seam2) {
|
||||
* Break up the blobs in this chain so that they are all independent.
|
||||
* This operation should undo the affect of join_pieces.
|
||||
**********************************************************************/
|
||||
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end) {
|
||||
TESSLINE *outline = blobs->outlines;
|
||||
TBLOB *next_blob;
|
||||
inT16 x;
|
||||
void break_pieces(const GenericVector<SEAM*>& seams, int first, int last,
|
||||
TWERD *word) {
|
||||
for (int x = first; x < last; ++x)
|
||||
reveal_seam(seams[x]);
|
||||
|
||||
for (x = start; x < end; x++)
|
||||
reveal_seam ((SEAM *) array_value (seams, x));
|
||||
TESSLINE *outline = word->blobs[first]->outlines;
|
||||
int next_blob = first + 1;
|
||||
|
||||
next_blob = blobs->next;
|
||||
|
||||
while (outline && next_blob) {
|
||||
if (outline->next == next_blob->outlines) {
|
||||
while (outline != NULL && next_blob <= last) {
|
||||
if (outline->next == word->blobs[next_blob]->outlines) {
|
||||
outline->next = NULL;
|
||||
outline = next_blob->outlines;
|
||||
next_blob = next_blob->next;
|
||||
}
|
||||
else {
|
||||
outline = word->blobs[next_blob]->outlines;
|
||||
++next_blob;
|
||||
} else {
|
||||
outline = outline->next;
|
||||
}
|
||||
}
|
||||
@ -561,30 +417,19 @@ void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end) {
|
||||
* Join a group of base level pieces into a single blob that can then
|
||||
* be classified.
|
||||
**********************************************************************/
|
||||
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end) {
|
||||
TBLOB *next_blob;
|
||||
TBLOB *blob;
|
||||
inT16 x;
|
||||
TESSLINE *outline;
|
||||
SEAM *seam;
|
||||
|
||||
for (x = 0, blob = piece_blobs; x < start; x++)
|
||||
blob = blob->next;
|
||||
next_blob = blob->next;
|
||||
outline = blob->outlines;
|
||||
void join_pieces(const GenericVector<SEAM*>& seams, int first, int last,
|
||||
TWERD *word) {
|
||||
TESSLINE *outline = word->blobs[first]->outlines;
|
||||
if (!outline)
|
||||
return;
|
||||
|
||||
while (x < end) {
|
||||
seam = (SEAM *) array_value (seams, x);
|
||||
if (x - seam->widthn >= start && x + seam->widthp < end)
|
||||
for (int x = first; x < last; ++x) {
|
||||
SEAM *seam = seams[x];
|
||||
if (x - seam->widthn >= first && x + seam->widthp < last)
|
||||
hide_seam(seam);
|
||||
while (outline->next)
|
||||
outline = outline->next;
|
||||
outline->next = next_blob->outlines;
|
||||
next_blob = next_blob->next;
|
||||
|
||||
x++;
|
||||
outline->next = word->blobs[x + 1]->outlines;
|
||||
}
|
||||
}
|
||||
|
||||
@ -626,7 +471,7 @@ void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt2) && edgept != pt1);
|
||||
if (edgept == pt1) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
edgept = pt2;
|
||||
@ -636,7 +481,7 @@ void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt1) && edgept != pt2);
|
||||
if (edgept == pt2) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
}
|
||||
@ -679,7 +524,7 @@ void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt2) && edgept != pt1);
|
||||
if (edgept == pt1) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
edgept = pt2;
|
||||
@ -689,7 +534,7 @@ void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt1) && edgept != pt2);
|
||||
if (edgept == pt2) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
}
|
||||
|
109
ccstruct/seam.h
109
ccstruct/seam.h
@ -30,15 +30,36 @@
|
||||
----------------------------------------------------------------------*/
|
||||
#include "blobs.h"
|
||||
#include "split.h"
|
||||
#include "tessarray.h"
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
T y p e s
|
||||
----------------------------------------------------------------------*/
|
||||
typedef float PRIORITY; /* PRIORITY */
|
||||
|
||||
typedef struct seam_record
|
||||
{ /* SEAM */
|
||||
struct SEAM {
|
||||
// Constructor that was formerly new_seam.
|
||||
SEAM(PRIORITY priority0, const TPOINT& location0,
|
||||
SPLIT *splita, SPLIT *splitb, SPLIT *splitc)
|
||||
: priority(priority0), widthp(0), widthn(0), location(location0),
|
||||
split1(splita), split2(splitb), split3(splitc) {}
|
||||
// Copy constructor that was formerly clone_seam.
|
||||
SEAM(const SEAM& src)
|
||||
: priority(src.priority), widthp(src.widthp), widthn(src.widthn),
|
||||
location(src.location) {
|
||||
clone_split(split1, src.split1);
|
||||
clone_split(split2, src.split2);
|
||||
clone_split(split3, src.split3);
|
||||
}
|
||||
// Destructor was delete_seam.
|
||||
~SEAM() {
|
||||
if (split1)
|
||||
delete_split(split1);
|
||||
if (split2)
|
||||
delete_split(split2);
|
||||
if (split3)
|
||||
delete_split(split3);
|
||||
}
|
||||
|
||||
PRIORITY priority;
|
||||
inT8 widthp;
|
||||
inT8 widthn;
|
||||
@ -46,36 +67,7 @@ typedef struct seam_record
|
||||
SPLIT *split1;
|
||||
SPLIT *split2;
|
||||
SPLIT *split3;
|
||||
} SEAM;
|
||||
|
||||
typedef ARRAY SEAMS; /* SEAMS */
|
||||
|
||||
extern SEAM *newseam();
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
M a c r o s
|
||||
----------------------------------------------------------------------*/
|
||||
/**
|
||||
* @name clone_seam
|
||||
*
|
||||
* Create a new seam record and copy the contents of this seam into it.
|
||||
*/
|
||||
|
||||
#define clone_seam(dest,source) \
|
||||
if (source) { \
|
||||
(dest) = newseam (); \
|
||||
(dest)->location = (source)->location; \
|
||||
(dest)->widthp = (source)->widthp; \
|
||||
(dest)->widthn = (source)->widthn; \
|
||||
(dest)->priority = (source)->priority; \
|
||||
clone_split ((dest)->split1, (source)->split1); \
|
||||
clone_split ((dest)->split2, (source)->split2); \
|
||||
clone_split ((dest)->split3, (source)->split3); \
|
||||
} \
|
||||
else { \
|
||||
(dest) = (SEAM*) NULL; \
|
||||
} \
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
* exact_point
|
||||
@ -92,61 +84,40 @@ else { \
|
||||
----------------------------------------------------------------------*/
|
||||
bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2);
|
||||
|
||||
bool point_in_seam(SEAM *seam, SPLIT *split);
|
||||
bool point_in_seam(const SEAM *seam, SPLIT *split);
|
||||
|
||||
bool point_used_by_split(SPLIT *split, EDGEPT *point);
|
||||
|
||||
bool point_used_by_seam(SEAM *seam, EDGEPT *point);
|
||||
|
||||
SEAMS add_seam(SEAMS seam_list, SEAM *seam);
|
||||
|
||||
void combine_seams(SEAM *dest_seam, SEAM *source_seam);
|
||||
|
||||
void delete_seam(void *arg); //SEAM *seam);
|
||||
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array);
|
||||
|
||||
SEAMS start_seam_list(TBLOB *blobs);
|
||||
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
|
||||
TWERD *word, int index);
|
||||
|
||||
void free_seam_list(SEAMS seam_list);
|
||||
void insert_seam(const TWERD *word, int index, SEAM *seam,
|
||||
GenericVector<SEAM*>* seam_array);
|
||||
|
||||
bool test_insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob);
|
||||
|
||||
SEAMS insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
SEAM *seam,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob);
|
||||
|
||||
int account_splits_right(SEAM *seam, TBLOB *blob);
|
||||
|
||||
int account_splits_left(SEAM *seam, TBLOB *blob, TBLOB *end_blob);
|
||||
|
||||
void account_splits_left_helper(SEAM *seam, TBLOB *blob, TBLOB *end_blob,
|
||||
inT32 *depth, inT8 *width, inT8 *found_em);
|
||||
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
|
||||
int blob_direction);
|
||||
|
||||
bool find_split_in_blob(SPLIT *split, TBLOB *blob);
|
||||
|
||||
SEAM *join_two_seams(SEAM *seam1, SEAM *seam2);
|
||||
|
||||
SEAM *new_seam(PRIORITY priority,
|
||||
const TPOINT& location,
|
||||
SPLIT *split1,
|
||||
SPLIT *split2,
|
||||
SPLIT *split3);
|
||||
|
||||
SEAMS new_seam_list();
|
||||
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2);
|
||||
|
||||
void print_seam(const char *label, SEAM *seam);
|
||||
|
||||
void print_seams(const char *label, SEAMS seams);
|
||||
void print_seams(const char *label, const GenericVector<SEAM*>& seams);
|
||||
|
||||
int shared_split_points(SEAM *seam1, SEAM *seam2);
|
||||
int shared_split_points(const SEAM *seam1, const SEAM *seam2);
|
||||
|
||||
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end);
|
||||
void break_pieces(const GenericVector<SEAM*>& seams,
|
||||
int first, int last, TWERD *word);
|
||||
|
||||
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end);
|
||||
void join_pieces(const GenericVector<SEAM*>& seams,
|
||||
int first, int last, TWERD *word);
|
||||
|
||||
void hide_seam(SEAM *seam);
|
||||
|
||||
|
@ -26,8 +26,8 @@
|
||||
I n c l u d e s
|
||||
----------------------------------------------------------------------*/
|
||||
#include "split.h"
|
||||
#include "structures.h"
|
||||
#include "callcpp.h"
|
||||
#include "coutln.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#ifdef __UNIX__
|
||||
#include <assert.h>
|
||||
@ -38,8 +38,6 @@
|
||||
----------------------------------------------------------------------*/
|
||||
BOOL_VAR(wordrec_display_splits, 0, "Display splits");
|
||||
|
||||
makestructure(newsplit, free_split, SPLIT);
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
F u n c t i o n s
|
||||
----------------------------------------------------------------------*/
|
||||
@ -47,12 +45,11 @@ makestructure(newsplit, free_split, SPLIT);
|
||||
/**********************************************************************
|
||||
* delete_split
|
||||
*
|
||||
* Remove this split from existance. Take if off the display list and
|
||||
* deallocate its memory.
|
||||
* Remove this split from existence.
|
||||
**********************************************************************/
|
||||
void delete_split(SPLIT *split) {
|
||||
if (split) {
|
||||
free_split(split);
|
||||
delete split;
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,6 +65,43 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
|
||||
this_edgept = new EDGEPT;
|
||||
this_edgept->pos.x = x;
|
||||
this_edgept->pos.y = y;
|
||||
// Now deal with the src_outline steps.
|
||||
C_OUTLINE* prev_ol = prev->src_outline;
|
||||
if (prev_ol != NULL && prev->next == next) {
|
||||
// Compute the fraction of the segment that is being cut.
|
||||
FCOORD segment_vec(next->pos.x - prev->pos.x, next->pos.y - prev->pos.y);
|
||||
FCOORD target_vec(x - prev->pos.x, y - prev->pos.y);
|
||||
double cut_fraction = target_vec.length() / segment_vec.length();
|
||||
// Get the start and end at the step level.
|
||||
ICOORD step_start = prev_ol->position_at_index(prev->start_step);
|
||||
int end_step = prev->start_step + prev->step_count;
|
||||
int step_length = prev_ol->pathlength();
|
||||
ICOORD step_end = prev_ol->position_at_index(end_step % step_length);
|
||||
ICOORD step_vec = step_end - step_start;
|
||||
double target_length = step_vec.length() * cut_fraction;
|
||||
// Find the point on the segment that gives the length nearest to target.
|
||||
int best_step = prev->start_step;
|
||||
ICOORD total_step(0, 0);
|
||||
double best_dist = target_length;
|
||||
for (int s = prev->start_step; s < end_step; ++s) {
|
||||
total_step += prev_ol->step(s % step_length);
|
||||
double dist = fabs(target_length - total_step.length());
|
||||
if (dist < best_dist) {
|
||||
best_dist = dist;
|
||||
best_step = s + 1;
|
||||
}
|
||||
}
|
||||
// The new point is an intermediate point.
|
||||
this_edgept->src_outline = prev_ol;
|
||||
this_edgept->step_count = end_step - best_step;
|
||||
this_edgept->start_step = best_step % step_length;
|
||||
prev->step_count = best_step - prev->start_step;
|
||||
} else {
|
||||
// The new point is poly only.
|
||||
this_edgept->src_outline = NULL;
|
||||
this_edgept->step_count = 0;
|
||||
this_edgept->start_step = 0;
|
||||
}
|
||||
/* Hook it up */
|
||||
this_edgept->next = next;
|
||||
this_edgept->prev = prev;
|
||||
@ -78,8 +112,7 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
|
||||
this_edgept->vec.y = this_edgept->next->pos.y - y;
|
||||
this_edgept->prev->vec.x = x - this_edgept->prev->pos.x;
|
||||
this_edgept->prev->vec.y = y - this_edgept->prev->pos.y;
|
||||
|
||||
return (this_edgept);
|
||||
return this_edgept;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
@ -90,6 +123,10 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
|
||||
void remove_edgept(EDGEPT *point) {
|
||||
EDGEPT *prev = point->prev;
|
||||
EDGEPT *next = point->next;
|
||||
// Add point's steps onto prev's steps if they are from the same outline.
|
||||
if (prev->src_outline == point->src_outline && prev->src_outline != NULL) {
|
||||
prev->step_count += point->step_count;
|
||||
}
|
||||
prev->next = next;
|
||||
next->prev = prev;
|
||||
prev->vec.x = next->pos.x - prev->pos.x;
|
||||
@ -104,8 +141,7 @@ void remove_edgept(EDGEPT *point) {
|
||||
* list.
|
||||
**********************************************************************/
|
||||
SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
|
||||
SPLIT *s;
|
||||
s = (SPLIT *) newsplit ();
|
||||
SPLIT *s = new SPLIT;
|
||||
s->point1 = point1;
|
||||
s->point2 = point2;
|
||||
return (s);
|
||||
@ -120,9 +156,9 @@ SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
|
||||
**********************************************************************/
|
||||
void print_split(SPLIT *split) {
|
||||
if (split) {
|
||||
cprintf ("(%d,%d)--(%d,%d)",
|
||||
split->point1->pos.x, split->point1->pos.y,
|
||||
split->point2->pos.x, split->point2->pos.y);
|
||||
tprintf("(%d,%d)--(%d,%d)",
|
||||
split->point1->pos.x, split->point1->pos.y,
|
||||
split->point2->pos.x, split->point2->pos.y);
|
||||
}
|
||||
}
|
||||
|
||||
@ -130,23 +166,35 @@ void print_split(SPLIT *split) {
|
||||
/**********************************************************************
|
||||
* split_outline
|
||||
*
|
||||
* Split between these two edge points. Apply a split and return a
|
||||
* pointer to the other side of the split.
|
||||
* Split between these two edge points.
|
||||
**********************************************************************/
|
||||
void split_outline(EDGEPT *join_point1, EDGEPT *join_point2) {
|
||||
EDGEPT *join_point1a;
|
||||
EDGEPT *temp2;
|
||||
EDGEPT *temp1;
|
||||
assert(join_point1 != join_point2);
|
||||
|
||||
assert (join_point1 != join_point2);
|
||||
|
||||
temp2 = join_point2->next;
|
||||
temp1 = join_point1->next;
|
||||
EDGEPT* temp2 = join_point2->next;
|
||||
EDGEPT* temp1 = join_point1->next;
|
||||
/* Create two new points */
|
||||
join_point1a = make_edgept (join_point1->pos.x,
|
||||
join_point1->pos.y, temp1, join_point2);
|
||||
|
||||
make_edgept (join_point2->pos.x, join_point2->pos.y, temp2, join_point1);
|
||||
EDGEPT* new_point1 = make_edgept(join_point1->pos.x, join_point1->pos.y,
|
||||
temp1, join_point2);
|
||||
EDGEPT* new_point2 = make_edgept(join_point2->pos.x, join_point2->pos.y,
|
||||
temp2, join_point1);
|
||||
// Join_point1 and 2 are now cross-over points, so they must have NULL
|
||||
// src_outlines and give their src_outline information their new
|
||||
// replacements.
|
||||
new_point1->src_outline = join_point1->src_outline;
|
||||
new_point1->start_step = join_point1->start_step;
|
||||
new_point1->step_count = join_point1->step_count;
|
||||
new_point2->src_outline = join_point2->src_outline;
|
||||
new_point2->start_step = join_point2->start_step;
|
||||
new_point2->step_count = join_point2->step_count;
|
||||
join_point1->src_outline = NULL;
|
||||
join_point1->start_step = 0;
|
||||
join_point1->step_count = 0;
|
||||
join_point2->src_outline = NULL;
|
||||
join_point2->start_step = 0;
|
||||
join_point2->step_count = 0;
|
||||
join_point1->MarkChop();
|
||||
join_point2->MarkChop();
|
||||
}
|
||||
|
||||
|
||||
@ -164,8 +212,18 @@ void unsplit_outlines(EDGEPT *p1, EDGEPT *p2) {
|
||||
tmp1->next->prev = p2;
|
||||
tmp2->next->prev = p1;
|
||||
|
||||
// tmp2 is coincident with p1. p1 takes tmp2's place as tmp2 is deleted.
|
||||
p1->next = tmp2->next;
|
||||
p1->src_outline = tmp2->src_outline;
|
||||
p1->start_step = tmp2->start_step;
|
||||
p1->step_count = tmp2->step_count;
|
||||
// Likewise p2 takes tmp1's place.
|
||||
p2->next = tmp1->next;
|
||||
p2->src_outline = tmp1->src_outline;
|
||||
p2->start_step = tmp1->start_step;
|
||||
p2->step_count = tmp1->step_count;
|
||||
p1->UnmarkChop();
|
||||
p2->UnmarkChop();
|
||||
|
||||
delete tmp1;
|
||||
delete tmp2;
|
||||
|
@ -42,8 +42,7 @@ class EDGEPT;
|
||||
|
||||
#define point_diff(p,p1,p2) \
|
||||
((p).x = (p1).x - (p2).x, \
|
||||
(p).y = (p1).y - (p2).y, \
|
||||
(p))
|
||||
(p).y = (p1).y - (p2).y)
|
||||
|
||||
/**********************************************************************
|
||||
* CROSS
|
||||
|
@ -465,7 +465,7 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
|
||||
TBOX a_blob_box = a_blob->bounding_box();
|
||||
if ((not_found_box.major_overlap(a_blob_box) ||
|
||||
a_blob_box.major_overlap(not_found_box)) &&
|
||||
not_found_box.y_overlap(a_blob_box)) {
|
||||
not_found_box.y_overlap(a_blob_box) > 0.8) {
|
||||
// Already taken care of.
|
||||
delete not_found_it.extract();
|
||||
break;
|
||||
|
@ -10,18 +10,16 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
|
||||
AM_CPPFLAGS += -DTESS_EXPORTS
|
||||
endif
|
||||
|
||||
EXTRA_DIST = mfcpch.cpp
|
||||
|
||||
include_HEADERS = \
|
||||
basedir.h errcode.h fileerr.h genericvector.h helpers.h host.h memry.h \
|
||||
ndminx.h params.h ocrclass.h platform.h serialis.h strngs.h \
|
||||
tesscallback.h unichar.h unicharmap.h unicharset.h
|
||||
|
||||
noinst_HEADERS = \
|
||||
ambigs.h bits16.h bitvector.h ccutil.h clst.h elst2.h \
|
||||
elst.h globaloc.h hashfn.h indexmapbidi.h lsterr.h \
|
||||
nwmain.h qrsequence.h secname.h sorthelper.h stderr.h tessdatamanager.h \
|
||||
tprintf.h unicity_table.h unicodes.h
|
||||
ambigs.h bits16.h bitvector.h ccutil.h clst.h doubleptr.h elst2.h \
|
||||
elst.h genericheap.h globaloc.h hashfn.h indexmapbidi.h kdpair.h lsterr.h \
|
||||
nwmain.h object_cache.h qrsequence.h secname.h sorthelper.h stderr.h tessdatamanager.h \
|
||||
tprintf.h unicity_table.h unicodes.h universalambigs.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_ccutil.la
|
||||
@ -39,7 +37,7 @@ libtesseract_ccutil_la_SOURCES = \
|
||||
serialis.cpp strngs.cpp \
|
||||
tessdatamanager.cpp tprintf.cpp \
|
||||
unichar.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
|
||||
params.cpp
|
||||
params.cpp universalambigs.cpp
|
||||
|
||||
if EMBEDDED
|
||||
include_HEADERS += scanutils.h
|
||||
@ -50,4 +48,4 @@ if MINGW
|
||||
AM_CPPFLAGS += -I$(top_srcdir)/vs2008/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
|
||||
noinst_HEADERS += ../vs2008/port/strtok_r.h
|
||||
libtesseract_ccutil_la_SOURCES += ../vs2008/port/strtok_r.cpp
|
||||
endif
|
||||
endif
|
||||
|
@ -19,7 +19,10 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "ambigs.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include "helpers.h"
|
||||
#include "universalambigs.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#ifndef __GNUC__
|
||||
@ -31,6 +34,11 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Maximum line size:
|
||||
// 10 for sizes of ambigs, tabs, abmig type and newline
|
||||
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
|
||||
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
|
||||
|
||||
AmbigSpec::AmbigSpec() {
|
||||
wrong_ngram[0] = INVALID_UNICHAR_ID;
|
||||
correct_fragments[0] = INVALID_UNICHAR_ID;
|
||||
@ -41,14 +49,10 @@ AmbigSpec::AmbigSpec() {
|
||||
|
||||
ELISTIZE(AmbigSpec);
|
||||
|
||||
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
|
||||
inT64 end_offset,
|
||||
int debug_level,
|
||||
bool use_ambigs_for_adaption,
|
||||
UNICHARSET *unicharset) {
|
||||
int i, j;
|
||||
UnicharIdVector *adaption_ambigs_entry;
|
||||
for (i = 0; i < unicharset->size(); ++i) {
|
||||
// Initializes the ambigs by adding a NULL pointer to each table.
|
||||
void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET& unicharset,
|
||||
bool use_ambigs_for_adaption) {
|
||||
for (int i = 0; i < unicharset.size(); ++i) {
|
||||
replace_ambigs_.push_back(NULL);
|
||||
dang_ambigs_.push_back(NULL);
|
||||
one_to_one_definite_ambigs_.push_back(NULL);
|
||||
@ -57,85 +61,103 @@ void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
|
||||
reverse_ambigs_for_adaption_.push_back(NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the universal ambigs that are useful for any language.
|
||||
void UnicharAmbigs::LoadUniversal(const UNICHARSET& encoder_set,
|
||||
UNICHARSET* unicharset) {
|
||||
FILE* fp = fmemopen(const_cast<char*>(kUniversalAmbigsFile),
|
||||
ksizeofUniversalAmbigsFile, "rb");
|
||||
if (fp == NULL) return;
|
||||
LoadUnicharAmbigs(encoder_set, fp, -1ll, 0, false, unicharset);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
|
||||
FILE *ambig_file,
|
||||
inT64 end_offset,
|
||||
int debug_level,
|
||||
bool use_ambigs_for_adaption,
|
||||
UNICHARSET *unicharset) {
|
||||
int i, j;
|
||||
UnicharIdVector *adaption_ambigs_entry;
|
||||
if (debug_level) tprintf("Reading ambiguities\n");
|
||||
|
||||
int TestAmbigPartSize;
|
||||
int ReplacementAmbigPartSize;
|
||||
// Maximum line size:
|
||||
// 10 for sizes of ambigs, tabs, abmig type and newline
|
||||
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
|
||||
int test_ambig_part_size;
|
||||
int replacement_ambig_part_size;
|
||||
// The space for buffer is allocated on the heap to avoid
|
||||
// GCC frame size warning.
|
||||
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
|
||||
const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
|
||||
char *buffer = new char[kBufferSize];
|
||||
char ReplacementString[kMaxAmbigStringSize];
|
||||
UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
|
||||
char replacement_string[kMaxAmbigStringSize];
|
||||
UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
|
||||
int line_num = 0;
|
||||
int type = NOT_AMBIG;
|
||||
|
||||
// Determine the version of the ambigs file.
|
||||
int version = 0;
|
||||
ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
|
||||
ASSERT_HOST(fgets(buffer, kBufferSize, ambig_file) != NULL &&
|
||||
strlen(buffer) > 0);
|
||||
if (*buffer == 'v') {
|
||||
version = static_cast<int>(strtol(buffer+1, NULL, 10));
|
||||
++line_num;
|
||||
} else {
|
||||
rewind(AmbigFile);
|
||||
rewind(ambig_file);
|
||||
}
|
||||
while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
|
||||
fgets(buffer, kBufferSize, AmbigFile) != NULL) {
|
||||
while ((end_offset < 0 || ftell(ambig_file) < end_offset) &&
|
||||
fgets(buffer, kBufferSize, ambig_file) != NULL) {
|
||||
chomp_string(buffer);
|
||||
if (debug_level > 2) tprintf("read line %s\n", buffer);
|
||||
++line_num;
|
||||
if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
|
||||
buffer, &TestAmbigPartSize, TestUnicharIds,
|
||||
&ReplacementAmbigPartSize,
|
||||
ReplacementString, &type)) continue;
|
||||
if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
|
||||
buffer, &test_ambig_part_size, test_unichar_ids,
|
||||
&replacement_ambig_part_size,
|
||||
replacement_string, &type)) continue;
|
||||
// Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
|
||||
AmbigSpec *ambig_spec = new AmbigSpec();
|
||||
InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
|
||||
TestAmbigPartSize, TestUnicharIds,
|
||||
ReplacementAmbigPartSize, ReplacementString, type,
|
||||
ambig_spec, unicharset);
|
||||
if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
|
||||
: dang_ambigs_,
|
||||
test_ambig_part_size, test_unichar_ids,
|
||||
replacement_ambig_part_size, replacement_string, type,
|
||||
ambig_spec, unicharset))
|
||||
continue;
|
||||
|
||||
// Update one_to_one_definite_ambigs_.
|
||||
if (TestAmbigPartSize == 1 &&
|
||||
ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
|
||||
if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
|
||||
one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
|
||||
if (test_ambig_part_size == 1 &&
|
||||
replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
|
||||
if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
|
||||
one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();
|
||||
}
|
||||
one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
|
||||
one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
|
||||
ambig_spec->correct_ngram_id);
|
||||
}
|
||||
// Update ambigs_for_adaption_.
|
||||
if (use_ambigs_for_adaption) {
|
||||
for (i = 0; i < TestAmbigPartSize; ++i) {
|
||||
if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
|
||||
ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
|
||||
}
|
||||
adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
|
||||
const char *tmp_ptr = ReplacementString;
|
||||
const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
|
||||
int step = unicharset->step(tmp_ptr);
|
||||
while (step > 0) {
|
||||
UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
|
||||
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
|
||||
// Add the new unichar id to adaption_ambigs_entry (only if the
|
||||
// vector does not already contain it) keeping it in sorted order.
|
||||
for (j = 0; j < adaption_ambigs_entry->size() &&
|
||||
(*adaption_ambigs_entry)[j] > id_to_insert; ++j);
|
||||
if (j < adaption_ambigs_entry->size()) {
|
||||
if ((*adaption_ambigs_entry)[j] != id_to_insert) {
|
||||
adaption_ambigs_entry->insert(id_to_insert, j);
|
||||
}
|
||||
} else {
|
||||
adaption_ambigs_entry->push_back(id_to_insert);
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
// Silently ignore invalid strings, as before, so it is safe to use a
|
||||
// universal ambigs file.
|
||||
if (unicharset->encode_string(replacement_string, true, &encoding,
|
||||
NULL, NULL)) {
|
||||
for (i = 0; i < test_ambig_part_size; ++i) {
|
||||
if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
|
||||
ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
|
||||
}
|
||||
adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
|
||||
for (int r = 0; r < encoding.size(); ++r) {
|
||||
UNICHAR_ID id_to_insert = encoding[r];
|
||||
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
|
||||
// Add the new unichar id to adaption_ambigs_entry (only if the
|
||||
// vector does not already contain it) keeping it in sorted order.
|
||||
for (j = 0; j < adaption_ambigs_entry->size() &&
|
||||
(*adaption_ambigs_entry)[j] > id_to_insert; ++j);
|
||||
if (j < adaption_ambigs_entry->size()) {
|
||||
if ((*adaption_ambigs_entry)[j] != id_to_insert) {
|
||||
adaption_ambigs_entry->insert(id_to_insert, j);
|
||||
}
|
||||
} else {
|
||||
adaption_ambigs_entry->push_back(id_to_insert);
|
||||
}
|
||||
}
|
||||
// Update tmp_ptr and step.
|
||||
tmp_ptr += step;
|
||||
step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -204,51 +226,96 @@ void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
|
||||
|
||||
bool UnicharAmbigs::ParseAmbiguityLine(
|
||||
int line_num, int version, int debug_level, const UNICHARSET &unicharset,
|
||||
char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
|
||||
char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
|
||||
int *replacement_ambig_part_size, char *replacement_string, int *type) {
|
||||
if (version > 1) {
|
||||
// Simpler format is just wrong-string correct-string type\n.
|
||||
STRING input(buffer);
|
||||
GenericVector<STRING> fields;
|
||||
input.split(' ', &fields);
|
||||
if (fields.size() != 3) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
// Encode wrong-string.
|
||||
GenericVector<UNICHAR_ID> unichars;
|
||||
if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL,
|
||||
NULL)) {
|
||||
return false;
|
||||
}
|
||||
*test_ambig_part_size = unichars.size();
|
||||
if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
// Copy encoded string to output.
|
||||
for (int i = 0; i < unichars.size(); ++i)
|
||||
test_unichar_ids[i] = unichars[i];
|
||||
test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
|
||||
// Encode replacement-string to check validity.
|
||||
if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL,
|
||||
NULL)) {
|
||||
return false;
|
||||
}
|
||||
*replacement_ambig_part_size = unichars.size();
|
||||
if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
if (sscanf(fields[2].string(), "%d", type) != 1) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string());
|
||||
return true;
|
||||
}
|
||||
int i;
|
||||
char *token;
|
||||
char *next_token;
|
||||
if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) {
|
||||
!sscanf(token, "%d", test_ambig_part_size) || test_ambig_part_size <= 0) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
|
||||
tprintf("Too many unichars in ambiguity on line %d\n");
|
||||
if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
for (i = 0; i < *TestAmbigPartSize; ++i) {
|
||||
for (i = 0; i < *test_ambig_part_size; ++i) {
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
|
||||
if (!unicharset.contains_unichar(token)) {
|
||||
if (debug_level) tprintf(kIllegalUnicharMsg, token);
|
||||
break;
|
||||
}
|
||||
TestUnicharIds[i] = unicharset.unichar_to_id(token);
|
||||
test_unichar_ids[i] = unicharset.unichar_to_id(token);
|
||||
}
|
||||
TestUnicharIds[i] = INVALID_UNICHAR_ID;
|
||||
test_unichar_ids[i] = INVALID_UNICHAR_ID;
|
||||
|
||||
if (i != *TestAmbigPartSize ||
|
||||
if (i != *test_ambig_part_size ||
|
||||
!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", ReplacementAmbigPartSize) ||
|
||||
*ReplacementAmbigPartSize <= 0) {
|
||||
!sscanf(token, "%d", replacement_ambig_part_size) ||
|
||||
*replacement_ambig_part_size <= 0) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
|
||||
tprintf("Too many unichars in ambiguity on line %d\n");
|
||||
if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
ReplacementString[0] = '\0';
|
||||
for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
|
||||
replacement_string[0] = '\0';
|
||||
for (i = 0; i < *replacement_ambig_part_size; ++i) {
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
|
||||
strcat(ReplacementString, token);
|
||||
strcat(replacement_string, token);
|
||||
if (!unicharset.contains_unichar(token)) {
|
||||
if (debug_level) tprintf(kIllegalUnicharMsg, token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i != *ReplacementAmbigPartSize) {
|
||||
if (i != *replacement_ambig_part_size) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
@ -271,20 +338,20 @@ bool UnicharAmbigs::ParseAmbiguityLine(
|
||||
return true;
|
||||
}
|
||||
|
||||
void UnicharAmbigs::InsertIntoTable(
|
||||
UnicharAmbigsVector &table, int TestAmbigPartSize,
|
||||
UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
|
||||
const char *ReplacementString, int type,
|
||||
bool UnicharAmbigs::InsertIntoTable(
|
||||
UnicharAmbigsVector &table, int test_ambig_part_size,
|
||||
UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
|
||||
const char *replacement_string, int type,
|
||||
AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
|
||||
ambig_spec->type = static_cast<AmbigType>(type);
|
||||
if (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 &&
|
||||
unicharset->to_lower(TestUnicharIds[0]) ==
|
||||
unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
|
||||
if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&
|
||||
unicharset->to_lower(test_unichar_ids[0]) ==
|
||||
unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) {
|
||||
ambig_spec->type = CASE_AMBIG;
|
||||
}
|
||||
|
||||
ambig_spec->wrong_ngram_size =
|
||||
UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);
|
||||
UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);
|
||||
|
||||
// Since we need to maintain a constant number of unichar positions in
|
||||
// order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
|
||||
@ -297,21 +364,21 @@ void UnicharAmbigs::InsertIntoTable(
|
||||
// Insert the corresponding correct ngram into the unicharset.
|
||||
// Unicharset code assumes that the "base" ngram is inserted into
|
||||
// the unicharset before fragments of this ngram are inserted.
|
||||
unicharset->unichar_insert(ReplacementString);
|
||||
unicharset->unichar_insert(replacement_string);
|
||||
ambig_spec->correct_ngram_id =
|
||||
unicharset->unichar_to_id(ReplacementString);
|
||||
if (ReplacementAmbigPartSize > 1) {
|
||||
unicharset->unichar_to_id(replacement_string);
|
||||
if (replacement_ambig_part_size > 1) {
|
||||
unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
|
||||
}
|
||||
// Add the corresponding fragments of the wrong ngram to unicharset.
|
||||
int i;
|
||||
for (i = 0; i < TestAmbigPartSize; ++i) {
|
||||
for (i = 0; i < test_ambig_part_size; ++i) {
|
||||
UNICHAR_ID unichar_id;
|
||||
if (TestAmbigPartSize == 1) {
|
||||
if (test_ambig_part_size == 1) {
|
||||
unichar_id = ambig_spec->correct_ngram_id;
|
||||
} else {
|
||||
STRING frag_str = CHAR_FRAGMENT::to_string(
|
||||
ReplacementString, i, TestAmbigPartSize, false);
|
||||
replacement_string, i, test_ambig_part_size, false);
|
||||
unicharset->unichar_insert(frag_str.string());
|
||||
unichar_id = unicharset->unichar_to_id(frag_str.string());
|
||||
}
|
||||
@ -321,11 +388,14 @@ void UnicharAmbigs::InsertIntoTable(
|
||||
|
||||
// Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
|
||||
// Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
|
||||
if (table[TestUnicharIds[0]] == NULL) {
|
||||
table[TestUnicharIds[0]] = new AmbigSpec_LIST();
|
||||
if (table[test_unichar_ids[0]] == NULL) {
|
||||
table[test_unichar_ids[0]] = new AmbigSpec_LIST();
|
||||
}
|
||||
table[TestUnicharIds[0]]->add_sorted(
|
||||
AmbigSpec::compare_ambig_specs, false, ambig_spec);
|
||||
if (table[test_unichar_ids[0]]->add_sorted(
|
||||
AmbigSpec::compare_ambig_specs, true, ambig_spec))
|
||||
return true;
|
||||
delete ambig_spec;
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -123,7 +123,10 @@ class AmbigSpec : public ELIST_LINK {
|
||||
*reinterpret_cast<const AmbigSpec * const *>(spec1);
|
||||
const AmbigSpec *s2 =
|
||||
*reinterpret_cast<const AmbigSpec * const *>(spec2);
|
||||
return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
|
||||
int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
|
||||
if (result != 0) return result;
|
||||
return UnicharIdArrayUtils::compare(s1->correct_fragments,
|
||||
s2->correct_fragments);
|
||||
}
|
||||
|
||||
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
||||
@ -150,6 +153,13 @@ class UnicharAmbigs {
|
||||
const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
|
||||
const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
|
||||
|
||||
// Initializes the ambigs by adding a NULL pointer to each table.
|
||||
void InitUnicharAmbigs(const UNICHARSET& unicharset,
|
||||
bool use_ambigs_for_adaption);
|
||||
|
||||
// Loads the universal ambigs that are useful for any language.
|
||||
void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
|
||||
|
||||
// Fills in two ambiguity tables (replaceable and dangerous) with information
|
||||
// read from the ambigs file. An ambiguity table is an array of lists.
|
||||
// The array is indexed by a class id. Each entry in the table provides
|
||||
@ -160,7 +170,10 @@ class UnicharAmbigs {
|
||||
// one_to_one_definite_ambigs_. This vector is also indexed by the class id
|
||||
// of the wrong part of the ambiguity and each entry contains a vector of
|
||||
// unichar ids that are ambiguous to it.
|
||||
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level,
|
||||
// encoder_set is used to encode the ambiguity strings, undisturbed by new
|
||||
// unichar_ids that may be created by adding the ambigs.
|
||||
void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
|
||||
FILE *ambigs_file, inT64 end_offset, int debug_level,
|
||||
bool use_ambigs_for_adaption, UNICHARSET *unicharset);
|
||||
|
||||
// Returns definite 1-1 ambigs for the given unichar id.
|
||||
@ -191,17 +204,18 @@ class UnicharAmbigs {
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
bool ParseAmbiguityLine(int line_num, int version, int debug_level,
|
||||
const UNICHARSET &unicharset, char *buffer,
|
||||
int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int *ReplacementAmbigPartSize,
|
||||
char *ReplacementString, int *type);
|
||||
void InsertIntoTable(UnicharAmbigsVector &table,
|
||||
int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int ReplacementAmbigPartSize,
|
||||
const char *ReplacementString, int type,
|
||||
int *test_ambig_part_size,
|
||||
UNICHAR_ID *test_unichar_ids,
|
||||
int *replacement_ambig_part_size,
|
||||
char *replacement_string, int *type);
|
||||
bool InsertIntoTable(UnicharAmbigsVector &table,
|
||||
int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
|
||||
int replacement_ambig_part_size,
|
||||
const char *replacement_string, int type,
|
||||
AmbigSpec *ambig_spec, UNICHARSET *unicharset);
|
||||
|
||||
UnicharAmbigsVector dang_ambigs_;
|
||||
UnicharAmbigsVector replace_ambigs_;
|
||||
GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
|
||||
|
93
ccutil/doubleptr.h
Normal file
93
ccutil/doubleptr.h
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: doubleptr.h
|
||||
// Description: Double-ended pointer that keeps pointing correctly even
|
||||
// when reallocated or copied.
|
||||
// Author: Ray Smith
|
||||
// Created: Wed Mar 14 12:22:57 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_
|
||||
#define TESSERACT_CCUTIL_DOUBLEPTR_H_
|
||||
|
||||
#include "errcode.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A smart pointer class that implements a double-ended pointer. Each end
|
||||
// points to the other end. The copy constructor and operator= have MOVE
|
||||
// semantics, meaning that the relationship with the other end moves to the
|
||||
// destination of the copy, leaving the source unattached.
|
||||
// For this reason both the copy constructor and the operator= take a non-const
|
||||
// reference argument, and the const reference versions cannot be used.
|
||||
// DoublePtr is useful to incorporate into structures that are part of a
|
||||
// collection such as GenericVector or STL containers, where reallocs can
|
||||
// relocate the members. DoublePtr is also useful in a GenericHeap, where it
|
||||
// can correctly maintain the pointer to an element of the heap despite it
|
||||
// getting moved around on the heap.
|
||||
class DoublePtr {
|
||||
public:
|
||||
DoublePtr() : other_end_(NULL) {}
|
||||
// Copy constructor steals the partner off src and is therefore a non
|
||||
// const reference arg.
|
||||
// Copying a const DoublePtr generates a compiler error.
|
||||
DoublePtr(DoublePtr& src) {
|
||||
other_end_ = src.other_end_;
|
||||
if (other_end_ != NULL) {
|
||||
other_end_->other_end_ = this;
|
||||
src.other_end_ = NULL;
|
||||
}
|
||||
}
|
||||
// Operator= steals the partner off src, and therefore needs src to be a non-
|
||||
// const reference.
|
||||
// Assigning from a const DoublePtr generates a compiler error.
|
||||
void operator=(DoublePtr& src) {
|
||||
Disconnect();
|
||||
other_end_ = src.other_end_;
|
||||
if (other_end_ != NULL) {
|
||||
other_end_->other_end_ = this;
|
||||
src.other_end_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Connects this and other, discarding any existing connections.
|
||||
void Connect(DoublePtr* other) {
|
||||
other->Disconnect();
|
||||
Disconnect();
|
||||
other->other_end_ = this;
|
||||
other_end_ = other;
|
||||
}
|
||||
// Disconnects this and other, making OtherEnd() return NULL for both.
|
||||
void Disconnect() {
|
||||
if (other_end_ != NULL) {
|
||||
other_end_->other_end_ = NULL;
|
||||
other_end_ = NULL;
|
||||
}
|
||||
}
|
||||
// Returns the pointer to the other end of the double pointer.
|
||||
DoublePtr* OtherEnd() const {
|
||||
return other_end_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Pointer to the other end of the link. It is always true that either
|
||||
// other_end_ == NULL or other_end_->other_end_ == this.
|
||||
DoublePtr* other_end_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_
|
@ -90,12 +90,6 @@ const ERRCODE ASSERT_FAILED = "Assert failed";
|
||||
void signal_exit( //
|
||||
int signal_code //Signal which
|
||||
);
|
||||
extern "C"
|
||||
{
|
||||
void err_exit();
|
||||
//The real signal
|
||||
void signal_termination_handler(int sig);
|
||||
};
|
||||
|
||||
void set_global_loc_code(int loc_code);
|
||||
|
||||
|
225
ccutil/genericheap.h
Normal file
225
ccutil/genericheap.h
Normal file
@ -0,0 +1,225 @@
|
||||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: genericheap.h
|
||||
// Description: Template heap class.
|
||||
// Author: Ray Smith, based on Dan Johnson's original code.
|
||||
// Created: Wed Mar 14 08:13:00 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "errcode.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_GENERICHEAP_H_
|
||||
#define TESSERACT_CCUTIL_GENERICHEAP_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// GenericHeap requires 1 template argument:
|
||||
// Pair will normally be either KDPairInc<Key, Data> or KDPairDec<Key, Data>
|
||||
// for some arbitrary Key and scalar, smart pointer, or non-ownership pointer
|
||||
// Data type, according to whether a MIN heap or a MAX heap is desired,
|
||||
// respectively. Using KDPtrPairInc<Key, Data> or KDPtrPairDec<Key, Data>,
|
||||
// GenericHeap can also handle simple Data pointers and own them.
|
||||
// If no additional data is required, Pair can also be a scalar, since
|
||||
// GenericHeap doesn't look inside it except for operator<.
|
||||
//
|
||||
// The heap is stored as a packed binary tree in an array hosted by a
|
||||
// GenericVector<Pair>, with the invariant that the children of each node are
|
||||
// both NOT Pair::operator< the parent node. KDPairInc defines Pair::operator<
|
||||
// to use Key::operator< to generate a MIN heap and KDPairDec defines
|
||||
// Pair::operator< to use Key::operator> to generate a MAX heap by reversing
|
||||
// all the comparisons.
|
||||
// See http://en.wikipedia.org/wiki/Heap_(data_structure) for more detail on
|
||||
// the basic heap implementation.
|
||||
//
|
||||
// Insertion and removal are both O(log n) and, unlike the STL heap, an
|
||||
// explicit Reshuffle function allows a node to be repositioned in time O(log n)
|
||||
// after changing its value.
|
||||
//
|
||||
// Accessing the element for revaluation is a more complex matter, since the
|
||||
// index and pointer can be changed arbitrarily by heap operations.
|
||||
// Revaluation can be done by making the Data type in the Pair derived from or
|
||||
// contain a DoublePtr as its first data element, making it possible to convert
|
||||
// the pointer to a Pair using KDPairInc::RecastDataPointer.
|
||||
template <typename Pair>
|
||||
class GenericHeap {
|
||||
public:
|
||||
GenericHeap() {}
|
||||
// The initial size is only a GenericVector::reserve. It is not enforced as
|
||||
// the size limit of the heap. Caller must implement their own enforcement.
|
||||
explicit GenericHeap(int initial_size) {
|
||||
heap_.reserve(initial_size);
|
||||
}
|
||||
|
||||
// Simple accessors.
|
||||
bool empty() const {
|
||||
return heap_.empty();
|
||||
}
|
||||
int size() const {
|
||||
return heap_.size();
|
||||
}
|
||||
int size_reserved() const {
|
||||
return heap_.size_reserved();
|
||||
}
|
||||
void clear() {
|
||||
// Clear truncates to 0 to keep the number reserved in tact.
|
||||
heap_.truncate(0);
|
||||
}
|
||||
// Provides access to the underlying vector.
|
||||
// Caution! any changes that modify the keys will invalidate the heap!
|
||||
GenericVector<Pair>* heap() {
|
||||
return &heap_;
|
||||
}
|
||||
|
||||
// Add entry to the heap, keeping the smallest item at the top, by operator<.
|
||||
// Note that *entry is used as the source of operator=, but it is non-const
|
||||
// to allow for a smart pointer to be contained within.
|
||||
// Time = O(log n).
|
||||
void Push(Pair* entry) {
|
||||
int hole_index = heap_.size();
|
||||
// Make a hole in the end of heap_ and sift it up to be the correct
|
||||
// location for the new *entry. To avoid needing a default constructor
|
||||
// for primitive types, and to allow for use of DoublePtr in the Pair
|
||||
// somewhere, we have to incur a double copy here.
|
||||
heap_.push_back(*entry);
|
||||
*entry = heap_.back();
|
||||
hole_index = SiftUp(hole_index, *entry);
|
||||
heap_[hole_index] = *entry;
|
||||
}
|
||||
|
||||
// Get the value of the top (smallest, defined by operator< ) element.
|
||||
const Pair& PeekTop() const {
|
||||
return heap_[0];
|
||||
}
|
||||
|
||||
// Removes the top element of the heap. If entry is not NULL, the element
|
||||
// is copied into *entry, otherwise it is discarded.
|
||||
// Returns false if the heap was already empty.
|
||||
// Time = O(log n).
|
||||
bool Pop(Pair* entry) {
|
||||
int new_size = heap_.size() - 1;
|
||||
if (new_size < 0)
|
||||
return false; // Already empty.
|
||||
if (entry != NULL)
|
||||
*entry = heap_[0];
|
||||
if (new_size > 0) {
|
||||
// Sift the hole at the start of the heap_ downwards to match the last
|
||||
// element.
|
||||
Pair hole_pair = heap_[new_size];
|
||||
heap_.truncate(new_size);
|
||||
int hole_index = SiftDown(0, hole_pair);
|
||||
heap_[hole_index] = hole_pair;
|
||||
} else {
|
||||
heap_.truncate(new_size);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Removes the MAXIMUM element of the heap. (MIN from a MAX heap.) If entry is
|
||||
// not NULL, the element is copied into *entry, otherwise it is discarded.
|
||||
// Time = O(n). Returns false if the heap was already empty.
|
||||
bool PopWorst(Pair* entry) {
|
||||
int heap_size = heap_.size();
|
||||
if (heap_size == 0) return false; // It cannot be empty!
|
||||
|
||||
// Find the maximum element. Its index is guaranteed to be greater than
|
||||
// the index of the parent of the last element, since by the heap invariant
|
||||
// the parent must be less than or equal to the children.
|
||||
int worst_index = heap_size - 1;
|
||||
int end_parent = ParentNode(worst_index);
|
||||
for (int i = worst_index - 1; i > end_parent; --i) {
|
||||
if (heap_[worst_index] < heap_[i])
|
||||
worst_index = i;
|
||||
}
|
||||
// Extract the worst element from the heap, leaving a hole at worst_index.
|
||||
if (entry != NULL)
|
||||
*entry = heap_[worst_index];
|
||||
--heap_size;
|
||||
if (heap_size > 0) {
|
||||
// Sift the hole upwards to match the last element of the heap_
|
||||
Pair hole_pair = heap_[heap_size];
|
||||
int hole_index = SiftUp(worst_index, hole_pair);
|
||||
heap_[hole_index] = hole_pair;
|
||||
}
|
||||
heap_.truncate(heap_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
// The pointed-to Pair has changed its key value, so the location of pair
|
||||
// is reshuffled to maintain the heap invariant.
|
||||
// Must be a valid pointer to an element of the heap_!
|
||||
// Caution! Since GenericHeap is based on GenericVector, reallocs may occur
|
||||
// whenever the vector is extended and elements may get shuffled by any
|
||||
// Push or Pop operation. Therefore use this function only if Data in Pair is
|
||||
// of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as
|
||||
// its first element. Reshuffles the heap to maintain the invariant.
|
||||
// Time = O(log n).
|
||||
void Reshuffle(Pair* pair) {
|
||||
int index = pair - &heap_[0];
|
||||
Pair hole_pair = heap_[index];
|
||||
index = SiftDown(index, hole_pair);
|
||||
index = SiftUp(index, hole_pair);
|
||||
heap_[index] = hole_pair;
|
||||
}
|
||||
|
||||
private:
|
||||
// A hole in the heap exists at hole_index, and we want to fill it with the
|
||||
// given pair. SiftUp sifts the hole upward to the correct position and
|
||||
// returns the destination index without actually putting pair there.
|
||||
int SiftUp(int hole_index, const Pair& pair) {
|
||||
int parent;
|
||||
while (hole_index > 0 && pair < heap_[parent = ParentNode(hole_index)]) {
|
||||
heap_[hole_index] = heap_[parent];
|
||||
hole_index = parent;
|
||||
}
|
||||
return hole_index;
|
||||
}
|
||||
|
||||
// A hole in the heap exists at hole_index, and we want to fill it with the
|
||||
// given pair. SiftDown sifts the hole downward to the correct position and
|
||||
// returns the destination index without actually putting pair there.
|
||||
int SiftDown(int hole_index, const Pair& pair) {
|
||||
int heap_size = heap_.size();
|
||||
int child;
|
||||
while ((child = LeftChild(hole_index)) < heap_size) {
|
||||
if (child + 1 < heap_size && heap_[child + 1] < heap_[child])
|
||||
++child;
|
||||
if (heap_[child] < pair) {
|
||||
heap_[hole_index] = heap_[child];
|
||||
hole_index = child;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return hole_index;
|
||||
}
|
||||
|
||||
// Functions to navigate the tree. Unlike the original implementation, we
|
||||
// store the root at index 0.
|
||||
int ParentNode(int index) const {
|
||||
return (index + 1) / 2 - 1;
|
||||
}
|
||||
int LeftChild(int index) const {
|
||||
return index * 2 + 1;
|
||||
}
|
||||
|
||||
private:
|
||||
GenericVector<Pair> heap_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_GENERICHEAP_H_
|
@ -20,6 +20,7 @@
|
||||
#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
#define TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@ -34,8 +35,13 @@
|
||||
template <typename T>
|
||||
class GenericVector {
|
||||
public:
|
||||
GenericVector() { this->init(kDefaultVectorSize); }
|
||||
explicit GenericVector(int size) { this->init(size); }
|
||||
GenericVector() {
|
||||
init(kDefaultVectorSize);
|
||||
}
|
||||
GenericVector(int size, T init_val) {
|
||||
init(size);
|
||||
init_to_size(size, init_val);
|
||||
}
|
||||
|
||||
// Copy
|
||||
GenericVector(const GenericVector& other) {
|
||||
@ -45,7 +51,7 @@ class GenericVector {
|
||||
GenericVector<T> &operator+=(const GenericVector& other);
|
||||
GenericVector<T> &operator=(const GenericVector& other);
|
||||
|
||||
virtual ~GenericVector();
|
||||
~GenericVector();
|
||||
|
||||
// Reserve some memory.
|
||||
void reserve(int size);
|
||||
@ -59,6 +65,9 @@ class GenericVector {
|
||||
int size() const {
|
||||
return size_used_;
|
||||
}
|
||||
int size_reserved() const {
|
||||
return size_reserved_;
|
||||
}
|
||||
|
||||
int length() const {
|
||||
return size_used_;
|
||||
@ -73,6 +82,8 @@ class GenericVector {
|
||||
T &get(int index) const;
|
||||
T &back() const;
|
||||
T &operator[](int index) const;
|
||||
// Returns the last object and removes it.
|
||||
T pop_back();
|
||||
|
||||
// Return the index of the T object.
|
||||
// This method NEEDS a compare_callback to be passed to
|
||||
@ -105,11 +116,11 @@ class GenericVector {
|
||||
|
||||
// Removes an element at the given index and
|
||||
// shifts the remaining elements to the left.
|
||||
virtual void remove(int index);
|
||||
void remove(int index);
|
||||
|
||||
// Truncates the array to the given size by removing the end.
|
||||
// If the current size is less, the array is not expanded.
|
||||
virtual void truncate(int size) {
|
||||
void truncate(int size) {
|
||||
if (size < size_used_)
|
||||
size_used_ = size;
|
||||
}
|
||||
@ -126,7 +137,7 @@ class GenericVector {
|
||||
// All the owned callbacks are also deleted.
|
||||
// If you don't want the callbacks to be deleted, before calling clear, set
|
||||
// the callback to NULL.
|
||||
virtual void clear();
|
||||
void clear();
|
||||
|
||||
// Delete objects pointed to by data_[i]
|
||||
void delete_data_pointers();
|
||||
@ -147,12 +158,12 @@ class GenericVector {
|
||||
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
|
||||
// Writes a vector of simple types to the given file. Assumes that bitwise
|
||||
// read/write of T will work. Returns false in case of error.
|
||||
virtual bool Serialize(FILE* fp) const;
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads a vector of simple types from the given file. Assumes that bitwise
|
||||
// read/write will work with ReverseN according to sizeof(T).
|
||||
// Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
virtual bool DeSerialize(bool swap, FILE* fp);
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
// Writes a vector of classes to the given file. Assumes the existence of
|
||||
// bool T::Serialize(FILE* fp) const that returns false in case of error.
|
||||
// Returns false in case of error.
|
||||
@ -262,7 +273,32 @@ class GenericVector {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the index of what would be the target_index_th item in the array
|
||||
// if the members were sorted, without actually sorting. Members are
|
||||
// shuffled around, but it takes O(n) time.
|
||||
// NOTE: uses operator< and operator== on the members.
|
||||
int choose_nth_item(int target_index) {
|
||||
// Make sure target_index is legal.
|
||||
if (target_index < 0)
|
||||
target_index = 0; // ensure legal
|
||||
else if (target_index >= size_used_)
|
||||
target_index = size_used_ - 1;
|
||||
unsigned int seed = 1;
|
||||
return choose_nth_item(target_index, 0, size_used_, &seed);
|
||||
}
|
||||
|
||||
// Swaps the elements with the given indices.
|
||||
void swap(int index1, int index2) {
|
||||
if (index1 != index2) {
|
||||
T tmp = data_[index1];
|
||||
data_[index1] = data_[index2];
|
||||
data_[index2] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// Internal recursive version of choose_nth_item.
|
||||
int choose_nth_item(int target_index, int start, int end, unsigned int* seed);
|
||||
|
||||
// Init the object, allocating size memory.
|
||||
void init(int size);
|
||||
@ -328,7 +364,7 @@ class PointerVector : public GenericVector<T*> {
|
||||
public:
|
||||
PointerVector() : GenericVector<T*>() { }
|
||||
explicit PointerVector(int size) : GenericVector<T*>(size) { }
|
||||
virtual ~PointerVector() {
|
||||
~PointerVector() {
|
||||
// Clear must be called here, even though it is called again by the base,
|
||||
// as the base will call the wrong clear.
|
||||
clear();
|
||||
@ -355,14 +391,14 @@ class PointerVector : public GenericVector<T*> {
|
||||
|
||||
// Removes an element at the given index and
|
||||
// shifts the remaining elements to the left.
|
||||
virtual void remove(int index) {
|
||||
void remove(int index) {
|
||||
delete GenericVector<T*>::data_[index];
|
||||
GenericVector<T*>::remove(index);
|
||||
}
|
||||
|
||||
// Truncates the array to the given size by removing the end.
|
||||
// If the current size is less, the array is not expanded.
|
||||
virtual void truncate(int size) {
|
||||
void truncate(int size) {
|
||||
for (int i = size; i < GenericVector<T*>::size_used_; ++i)
|
||||
delete GenericVector<T*>::data_[i];
|
||||
GenericVector<T*>::truncate(size);
|
||||
@ -394,14 +430,14 @@ class PointerVector : public GenericVector<T*> {
|
||||
// All the owned callbacks are also deleted.
|
||||
// If you don't want the callbacks to be deleted, before calling clear, set
|
||||
// the callback to NULL.
|
||||
virtual void clear() {
|
||||
void clear() {
|
||||
GenericVector<T*>::delete_data_pointers();
|
||||
GenericVector<T*>::clear();
|
||||
}
|
||||
|
||||
// Writes a vector of simple types to the given file. Assumes that bitwise
|
||||
// read/write of T will work. Returns false in case of error.
|
||||
virtual bool Serialize(FILE* fp) const {
|
||||
bool Serialize(FILE* fp) const {
|
||||
inT32 used = GenericVector<T*>::size_used_;
|
||||
if (fwrite(&used, sizeof(used), 1, fp) != 1) return false;
|
||||
for (int i = 0; i < used; ++i) {
|
||||
@ -416,7 +452,7 @@ class PointerVector : public GenericVector<T*> {
|
||||
// Also needs T::T(), as new T is used in this function.
|
||||
// Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
virtual bool DeSerialize(bool swap, FILE* fp) {
|
||||
bool DeSerialize(bool swap, FILE* fp) {
|
||||
inT32 reserved;
|
||||
if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false;
|
||||
if (swap) Reverse32(&reserved);
|
||||
@ -515,7 +551,8 @@ T &GenericVector<T>::get(int index) const {
|
||||
|
||||
template <typename T>
|
||||
T &GenericVector<T>::operator[](int index) const {
|
||||
return data_[index];
|
||||
assert(index >= 0 && index < size_used_);
|
||||
return data_[index];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -523,6 +560,12 @@ T &GenericVector<T>::back() const {
|
||||
ASSERT_HOST(size_used_ > 0);
|
||||
return data_[size_used_ - 1];
|
||||
}
|
||||
// Returns the last object and removes it.
|
||||
template <typename T>
|
||||
T GenericVector<T>::pop_back() {
|
||||
ASSERT_HOST(size_used_ > 0);
|
||||
return data_[--size_used_];
|
||||
}
|
||||
|
||||
// Return the object from an index.
|
||||
template <typename T>
|
||||
@ -536,7 +579,7 @@ void GenericVector<T>::set(T t, int index) {
|
||||
// at the specified index.
|
||||
template <typename T>
|
||||
void GenericVector<T>::insert(T t, int index) {
|
||||
ASSERT_HOST(index >= 0 && index < size_used_);
|
||||
ASSERT_HOST(index >= 0 && index <= size_used_);
|
||||
if (size_reserved_ == size_used_)
|
||||
double_the_size();
|
||||
for (int i = size_used_; i > index; --i) {
|
||||
@ -642,7 +685,8 @@ void GenericVector<T>::set_clear_callback(TessCallback1<T>* cb) {
|
||||
// Add a callback to be called to delete the elements when the array took
|
||||
// their ownership.
|
||||
template <typename T>
|
||||
void GenericVector<T>::set_compare_callback(TessResultCallback2<bool, T const &, T const &>* cb) {
|
||||
void GenericVector<T>::set_compare_callback(
|
||||
TessResultCallback2<bool, T const &, T const &>* cb) {
|
||||
compare_cb_ = cb;
|
||||
}
|
||||
|
||||
@ -804,4 +848,61 @@ void GenericVector<T>::sort() {
|
||||
sort(&tesseract::sort_cmp<T>);
|
||||
}
|
||||
|
||||
// Internal recursive version of choose_nth_item.
|
||||
// The algorithm used comes from "Algorithms" by Sedgewick:
|
||||
// http://books.google.com/books/about/Algorithms.html?id=idUdqdDXqnAC
|
||||
// The principle is to choose a random pivot, and move everything less than
|
||||
// the pivot to its left, and everything greater than the pivot to the end
|
||||
// of the array, then recurse on the part that contains the desired index, or
|
||||
// just return the answer if it is in the equal section in the middle.
|
||||
// The random pivot guarantees average linear time for the same reason that
|
||||
// n times vector::push_back takes linear time on average.
|
||||
// target_index, start and and end are all indices into the full array.
|
||||
// Seed is a seed for rand_r for thread safety purposes. Its value is
|
||||
// unimportant as the random numbers do not affect the result except
|
||||
// between equal answers.
|
||||
template <typename T>
|
||||
int GenericVector<T>::choose_nth_item(int target_index, int start, int end,
|
||||
unsigned int* seed) {
|
||||
// Number of elements to process.
|
||||
int num_elements = end - start;
|
||||
// Trivial cases.
|
||||
if (num_elements <= 1)
|
||||
return start;
|
||||
if (num_elements == 2) {
|
||||
if (data_[start] < data_[start + 1]) {
|
||||
return target_index > start ? start + 1 : start;
|
||||
} else {
|
||||
return target_index > start ? start : start + 1;
|
||||
}
|
||||
}
|
||||
// Place the pivot at start.
|
||||
int pivot = rand_r(seed) % num_elements + start;
|
||||
swap(pivot, start);
|
||||
// The invariant condition here is that items [start, next_lesser) are less
|
||||
// than the pivot (which is at index next_lesser) and items
|
||||
// [prev_greater, end) are greater than the pivot, with items
|
||||
// [next_lesser, prev_greater) being equal to the pivot.
|
||||
int next_lesser = start;
|
||||
int prev_greater = end;
|
||||
for (int next_sample = start + 1; next_sample < prev_greater;) {
|
||||
if (data_[next_sample] < data_[next_lesser]) {
|
||||
swap(next_lesser++, next_sample++);
|
||||
} else if (data_[next_sample] == data_[next_lesser]) {
|
||||
++next_sample;
|
||||
} else {
|
||||
swap(--prev_greater, next_sample);
|
||||
}
|
||||
}
|
||||
// Now the invariant is set up, we recurse on just the section that contains
|
||||
// the desired index.
|
||||
if (target_index < next_lesser)
|
||||
return choose_nth_item(target_index, start, next_lesser, seed);
|
||||
else if (target_index < prev_greater)
|
||||
return next_lesser; // In equal bracket.
|
||||
else
|
||||
return choose_nth_item(target_index, prev_greater, end, seed);
|
||||
}
|
||||
|
||||
|
||||
#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
|
@ -18,84 +18,56 @@
|
||||
**********************************************************************/
|
||||
|
||||
#include <signal.h>
|
||||
#ifdef __linux__
|
||||
#include <sys/syscall.h> // For SYS_gettid.
|
||||
#include <unistd.h> // For syscall itself.
|
||||
#endif
|
||||
#include "allheaders.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
/*inT16 global_loc_code = LOC_INIT;//location code
|
||||
inT16 global_subloc_code = SUBLOC_NORM;
|
||||
//pass2 subloc code
|
||||
inT16 global_subsubloc_code = SUBSUBLOC_OTHER;
|
||||
//location code
|
||||
inT16 global_abort_code = NO_ABORT_CODE;
|
||||
//Prog abort code
|
||||
*/
|
||||
void signal_exit( //
|
||||
int signal_code //Signal which
|
||||
) {
|
||||
/*int exit_status;
|
||||
// Size of thread-id array of pixes to keep in case of crash.
|
||||
const int kMaxNumThreadPixes = 32768;
|
||||
|
||||
if ((global_loc_code == LOC_PASS2) || (global_loc_code == LOC_FUZZY_SPACE))
|
||||
global_loc_code += global_subloc_code + global_subsubloc_code;
|
||||
Pix* global_crash_pixes[kMaxNumThreadPixes];
|
||||
|
||||
if (signal_code < 0) {
|
||||
exit_status = global_loc_code * 8 + global_abort_code * 2 + 1;
|
||||
tprintf ("Signal_exit %d ABORT. LocCode: %d AbortCode: %d\n",
|
||||
exit_status, global_loc_code, global_abort_code);
|
||||
void SavePixForCrash(int resolution, Pix* pix) {
|
||||
#ifdef __linux__
|
||||
int thread_id = syscall(SYS_gettid) % kMaxNumThreadPixes;
|
||||
pixDestroy(&global_crash_pixes[thread_id]);
|
||||
if (pix != NULL) {
|
||||
Pix* clone = pixClone(pix);
|
||||
pixSetXRes(clone, resolution);
|
||||
pixSetYRes(clone, resolution);
|
||||
global_crash_pixes[thread_id] = clone;
|
||||
}
|
||||
else {
|
||||
exit_status = global_loc_code * 8 + signal_code * 2;
|
||||
tprintf ("Signal_exit %d SIGNAL ABORT. LocCode: %d SignalCode: %d\n",
|
||||
exit_status, global_loc_code, signal_code);
|
||||
}
|
||||
|
||||
exit(exit_status);*/
|
||||
exit(signal_code);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* err_exit()
|
||||
* All program exits should go through this point. It allows a meaningful status
|
||||
* code to be generated for the real exit() call. The status code is made up
|
||||
* as follows:
|
||||
* Bit 0 : 1 = Program Abort 0 = System Abort
|
||||
* Bits 1,2 : IF bit 0 = 1 THEN ERRCODE::abort_code
|
||||
* ELSE 0 = Bus Err or Seg Vi
|
||||
* 1 = Floating point exception
|
||||
* 2 = TimeOut (Signal 15 from command timer)
|
||||
* 3 = Any other signal
|
||||
* Bits 3..7 : Location code NEVER 0 !
|
||||
*************************************************************************/
|
||||
|
||||
//extern "C" {
|
||||
// CALL ONLY from a signal handler! Writes a crash image to stderr.
// On Linux, dumps the per-thread crash image (saved by SavePixForCrash)
// as a PNG between cut markers, then raises SIGILL so the default handler
// produces a useful stack trace. Elsewhere it simply aborts.
void signal_exit(int signal_code) {
  tprintf("Received signal %d!\n", signal_code);
#ifdef __linux__
  // SYS_gettid yields the kernel thread id; the modulo maps it into the
  // fixed-size per-thread image table.
  int thread_id = syscall(SYS_gettid) % kMaxNumThreadPixes;
  if (global_crash_pixes[thread_id] != NULL) {
    fprintf(stderr, "Crash caused by image with resolution %d\n",
            pixGetYRes(global_crash_pixes[thread_id]));
    // The PNG bytes are bracketed by markers so they can be cut out of a
    // captured stderr log and decoded to reproduce the crash.
    fprintf(stderr, "<Cut here>\n");
    pixWriteStreamPng(stderr, global_crash_pixes[thread_id], 0.0);
    fprintf(stderr, "\n<End cut>\n");
  }
  // Raise an uncaught signal, so as to get a useful stack trace.
  raise(SIGILL);
#else
  abort();
#endif
}
|
||||
|
||||
// Reports a fatal error and terminates the process.
// signal_exit(-1) never returns (it raises SIGILL or aborts); the
// ASSERT_HOST below is a belt-and-braces guarantee of termination.
void err_exit() {
  signal_exit (-1);
  ASSERT_HOST("Fatal error encountered!" == NULL);
}
|
||||
|
||||
|
||||
// Handler installed for fatal signals. Logs the signal through the ERRCODE
// mechanism, then terminates via signal_exit(). The switch cases have no
// break statements because signal_exit() never returns; the numeric
// arguments are legacy exit-status codes (see the err_exit() header
// comment block for the historical encoding).
void signal_termination_handler(int sig) {
  const ERRCODE SIGNAL_HANDLER_ERR = "Signal_termination_handler called";
  SIGNAL_HANDLER_ERR.error("signal_termination_handler", ABORT, "Code %d", sig);
  switch (sig) {
    case SIGABRT:
      signal_exit (-1);              //use abort code
    // case SIGBUS:
    case SIGSEGV:
      signal_exit (0);               // bus error / segmentation violation
    case SIGFPE:
      signal_exit (1);               //floating point
    case SIGTERM:
      signal_exit (2);               //timeout by cmdtimer
    default:
      signal_exit (3);               //Anything else
  }
}
|
||||
|
||||
|
||||
//}; //end extern "C"
|
||||
|
||||
|
||||
void set_global_loc_code(int loc_code) {
|
||||
// global_loc_code = loc_code;
|
||||
|
||||
|
@ -22,14 +22,14 @@
|
||||
|
||||
#include "host.h"
|
||||
|
||||
void signal_exit( //
|
||||
int signal_code //Signal which
|
||||
);
|
||||
//extern "C" {
|
||||
// Saves a clone of the given pix, and notes its resolution in thread-specific
|
||||
// data, so that the image can be written prior to a crash.
|
||||
struct Pix;
|
||||
void SavePixForCrash(int resolution, Pix* pix);
|
||||
|
||||
void signal_exit(int signal_code);
|
||||
|
||||
void err_exit();
|
||||
//The real signal
|
||||
void signal_termination_handler(int sig);
|
||||
//};
|
||||
|
||||
void set_global_loc_code(int loc_code);
|
||||
|
||||
|
189
ccutil/kdpair.h
Normal file
189
ccutil/kdpair.h
Normal file
@ -0,0 +1,189 @@
|
||||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: kdpair.h
|
||||
// Description: Template pair class like STL pair but geared towards
|
||||
// the Key+Data design pattern in which some data needs
|
||||
// to be sorted or kept in a heap sorted on some separate key.
|
||||
// Author: Ray Smith.
|
||||
// Created: Thu Mar 15 14:48:05 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_KDPAIR_H_
|
||||
#define TESSERACT_CCUTIL_KDPAIR_H_
|
||||
|
||||
#include "genericvector.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A useful base struct to facilitate the common operation of sorting a vector
// of simple or smart-pointer data using a separate key. Similar to STL pair.
template <typename K, typename D>
struct KDPair {
  KDPair() {}
  KDPair(K k, D d) : data(d), key(k) {}

  // Equality considers only the key; the payload is ignored.
  int operator==(const KDPair<K, D>& other) const {
    return key == other.key;
  }

  // WARNING! Keep data as the first element! KDPairInc and KDPairDec depend
  // on the order of these elements so they can downcast pointers appropriately
  // for use by GenericHeap::Reshuffle.
  D data;
  K key;
};
// Specialization of KDPair to provide operator< for sorting in increasing order
// and recasting of data pointers for use with DoublePtr.
template <typename K, typename D>
struct KDPairInc : public KDPair<K, D> {
  KDPairInc() {}
  KDPairInc(K k, D d) : KDPair<K, D>(k, d) {}
  // Sorting a container of KDPairInc yields ascending key order.
  int operator<(const KDPairInc<K, D>& other) const {
    return this->key < other.key;
  }
  // Reinterprets a pointer to the data member (guaranteed to be the first
  // member, see KDPair) as a pointer to the enclosing pair.
  static KDPairInc* RecastDataPointer(D* data_ptr) {
    return reinterpret_cast<KDPairInc*>(data_ptr);
  }
};
// Specialization of KDPair to provide operator< for sorting in decreasing order
// and recasting of data pointers for use with DoublePtr.
template <typename K, typename D>
struct KDPairDec : public KDPair<K, D> {
  KDPairDec() {}
  KDPairDec(K k, D d) : KDPair<K, D>(k, d) {}
  // Sorting a container of KDPairDec yields descending key order, achieved
  // by comparing the keys with operator> inside operator<.
  int operator<(const KDPairDec<K, D>& other) const {
    return this->key > other.key;
  }
  // Reinterprets a pointer to the data member (guaranteed to be the first
  // member, see KDPair) as a pointer to the enclosing pair.
  static KDPairDec* RecastDataPointer(D* data_ptr) {
    return reinterpret_cast<KDPairDec*>(data_ptr);
  }
};
|
||||
|
||||
// A useful base class to facilitate the common operation of sorting a vector
// of owned pointer data using a separate key. This class owns its data pointer,
// deleting it when it has finished with it, and providing copy constructor and
// operator= that have move semantics so that the data does not get copied and
// only a single instance of KDPtrPair holds a specific data pointer.
template <typename Key, typename Data>
class KDPtrPair {
 public:
  KDPtrPair() : data_(NULL) {}
  // Takes ownership of d.
  KDPtrPair(Key k, Data* d) : data_(d), key_(k) {}
  // Copy constructor steals the pointer from src and NULLs it in src, thereby
  // moving the (single) ownership of the data.
  KDPtrPair(KDPtrPair& src) : data_(src.data_), key_(src.key_) {
    src.data_ = NULL;
  }
  // Destructor deletes data, assuming it is the sole owner.
  ~KDPtrPair() {
    delete this->data_;
    this->data_ = NULL;
  }
  // Operator= steals the pointer from src and NULLs it in src, thereby
  // moving the (single) ownership of the data.
  // Self-assignment is a no-op: without the guard, deleting data_ first
  // would leave both sides holding a dangling pointer.
  void operator=(KDPtrPair& src) {
    if (this != &src) {
      delete this->data_;
      this->data_ = src.data_;
      src.data_ = NULL;
      this->key_ = src.key_;
    }
  }

  // Pairs compare equal when their keys do; the data is not compared.
  int operator==(const KDPtrPair<Key, Data>& other) const {
    return key_ == other.key_;
  }

  // Accessors.
  const Key& key() const {
    return key_;
  }
  void set_key(const Key& new_key) {
    key_ = new_key;
  }
  const Data* data() const {
    return data_;
  }
  // Sets the data pointer, taking ownership of the data.
  // Guarded so that re-setting the currently-owned pointer does not delete
  // it before storing it back (use-after-free).
  void set_data(Data* new_data) {
    if (new_data != data_)
      delete data_;
    data_ = new_data;
  }
  // Relinquishes ownership of the data pointer (setting it to NULL).
  // The caller becomes responsible for deleting the returned pointer.
  Data* extract_data() {
    Data* result = data_;
    data_ = NULL;
    return result;
  }

 private:
  // Data members are private to keep deletion of data_ encapsulated.
  Data* data_;
  Key key_;
};
|
||||
// Specialization of KDPtrPair to provide operator< for sorting in increasing
|
||||
// order.
|
||||
template <typename Key, typename Data>
|
||||
struct KDPtrPairInc : public KDPtrPair<Key, Data> {
|
||||
// Since we are doing non-standard stuff we have to duplicate *all* the
|
||||
// constructors and operator=.
|
||||
KDPtrPairInc() : KDPtrPair<Key, Data>() {}
|
||||
KDPtrPairInc(Key k, Data* d) : KDPtrPair<Key, Data>(k, d) {}
|
||||
KDPtrPairInc(KDPtrPairInc& src) : KDPtrPair<Key, Data>(src) {}
|
||||
void operator=(KDPtrPairInc& src) {
|
||||
KDPtrPair<Key, Data>::operator=(src);
|
||||
}
|
||||
// Operator< facilitates sorting in increasing order.
|
||||
int operator<(const KDPtrPairInc<Key, Data>& other) const {
|
||||
return this->key() < other.key();
|
||||
}
|
||||
};
|
||||
// Specialization of KDPtrPair to provide operator< for sorting in decreasing
|
||||
// order.
|
||||
template <typename Key, typename Data>
|
||||
struct KDPtrPairDec : public KDPtrPair<Key, Data> {
|
||||
// Since we are doing non-standard stuff we have to duplicate *all* the
|
||||
// constructors and operator=.
|
||||
KDPtrPairDec() : KDPtrPair<Key, Data>() {}
|
||||
KDPtrPairDec(Key k, Data* d) : KDPtrPair<Key, Data>(k, d) {}
|
||||
KDPtrPairDec(KDPtrPairDec& src) : KDPtrPair<Key, Data>(src) {}
|
||||
void operator=(KDPtrPairDec& src) {
|
||||
KDPtrPair<Key, Data>::operator=(src);
|
||||
}
|
||||
// Operator< facilitates sorting in decreasing order by using operator> on
|
||||
// the key values.
|
||||
int operator<(const KDPtrPairDec<Key, Data>& other) const {
|
||||
return this->key() > other.key();
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for a pair of ints (key and data), sorted on the key in
// increasing order.
typedef KDPairInc<int, int> IntKDPair;

// Vector of IntKDPair.
class KDVector : public GenericVector<IntKDPair> {
  // TODO(rays) Add some code to manipulate a KDVector. For now there
  // is nothing and this class is effectively a specialization typedef.
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_KDPAIR_H_
|
125
ccutil/object_cache.h
Normal file
125
ccutil/object_cache.h
Normal file
@ -0,0 +1,125 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: object_cache.h
|
||||
// Description: A string indexed object cache.
|
||||
// Author: David Eger
|
||||
// Created: Fri Jan 27 12:08:00 PST 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_OBJECT_CACHE_H_
|
||||
#define TESSERACT_CCUTIL_OBJECT_CACHE_H_
|
||||
|
||||
#include "ccutil.h"
|
||||
#include "errcode.h"
|
||||
#include "genericvector.h"
|
||||
#include "tesscallback.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A simple object cache which maps a string to an object of type T.
|
||||
// Usually, these are expensive objects that are loaded from disk.
|
||||
// Reference counting is performed, so every Get() needs to be followed later
|
||||
// by a Free(). Actual deletion is accomplished by DeleteUnusedObjects().
|
||||
template<typename T>
|
||||
class ObjectCache {
|
||||
public:
|
||||
ObjectCache() {}
|
||||
~ObjectCache() {
|
||||
mu_.Lock();
|
||||
for (int i = 0; i < cache_.size(); i++) {
|
||||
if (cache_[i].count > 0) {
|
||||
tprintf("ObjectCache(%p)::~ObjectCache(): WARNING! LEAK! object %p "
|
||||
"still has count %d (id %s)\n",
|
||||
this, cache_[i].object, cache_[i].count,
|
||||
cache_[i].id.string());
|
||||
} else {
|
||||
delete cache_[i].object;
|
||||
cache_[i].object = NULL;
|
||||
}
|
||||
}
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
// Return a pointer to the object identified by id.
|
||||
// If we haven't yet loaded the object, use loader to load it.
|
||||
// If loader fails to load it, record a NULL entry in the cache
|
||||
// and return NULL -- further attempts to load will fail (even
|
||||
// with a different loader) until DeleteUnusedObjects() is called.
|
||||
// We delete the given loader.
|
||||
T *Get(STRING id,
|
||||
TessResultCallback<T *> *loader) {
|
||||
T *retval = NULL;
|
||||
mu_.Lock();
|
||||
for (int i = 0; i < cache_.size(); i++) {
|
||||
if (id == cache_[i].id) {
|
||||
retval = cache_[i].object;
|
||||
if (cache_[i].object != NULL) {
|
||||
cache_[i].count++;
|
||||
}
|
||||
mu_.Unlock();
|
||||
delete loader;
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
cache_.push_back(ReferenceCount());
|
||||
ReferenceCount &rc = cache_.back();
|
||||
rc.id = id;
|
||||
retval = rc.object = loader->Run();
|
||||
rc.count = (retval != NULL) ? 1 : 0;
|
||||
mu_.Unlock();
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Decrement the count for t.
|
||||
// Return whether we knew about the given pointer.
|
||||
bool Free(T *t) {
|
||||
if (t == NULL) return false;
|
||||
mu_.Lock();
|
||||
for (int i = 0; i < cache_.size(); i++) {
|
||||
if (cache_[i].object == t) {
|
||||
--cache_[i].count;
|
||||
mu_.Unlock();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
mu_.Unlock();
|
||||
return false;
|
||||
}
|
||||
|
||||
void DeleteUnusedObjects() {
|
||||
mu_.Lock();
|
||||
for (int i = cache_.size() - 1; i >= 0; i--) {
|
||||
if (cache_[i].count <= 0) {
|
||||
delete cache_[i].object;
|
||||
cache_.remove(i);
|
||||
}
|
||||
}
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
private:
|
||||
struct ReferenceCount {
|
||||
STRING id; // A unique ID to identify the object (think path on disk)
|
||||
T *object; // A copy of the object in memory. Can be delete'd.
|
||||
int count; // A count of the number of active users of this object.
|
||||
};
|
||||
|
||||
CCUtilMutex mu_;
|
||||
GenericVector<ReferenceCount> cache_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
|
||||
#endif // TESSERACT_CCUTIL_OBJECT_CACHE_H_
|
@ -207,4 +207,25 @@ void ParamUtils::PrintParams(FILE *fp, const ParamsVectors *member_params) {
|
||||
}
|
||||
}
|
||||
|
||||
// Resets all parameters back to default values;
|
||||
void ParamUtils::ResetToDefaults(ParamsVectors* member_params) {
|
||||
int v, i;
|
||||
int num_iterations = (member_params == NULL) ? 1 : 2;
|
||||
for (v = 0; v < num_iterations; ++v) {
|
||||
ParamsVectors *vec = (v == 0) ? GlobalParams() : member_params;
|
||||
for (i = 0; i < vec->int_params.size(); ++i) {
|
||||
vec->int_params[i]->ResetToDefault();
|
||||
}
|
||||
for (i = 0; i < vec->bool_params.size(); ++i) {
|
||||
vec->bool_params[i]->ResetToDefault();
|
||||
}
|
||||
for (int i = 0; i < vec->string_params.size(); ++i) {
|
||||
vec->string_params[i]->ResetToDefault();
|
||||
}
|
||||
for (int i = 0; i < vec->double_params.size(); ++i) {
|
||||
vec->double_params[i]->ResetToDefault();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -104,6 +104,9 @@ class ParamUtils {
|
||||
|
||||
// Print parameters to the given file.
|
||||
static void PrintParams(FILE *fp, const ParamsVectors *member_params);
|
||||
|
||||
// Resets all parameters back to default values;
|
||||
static void ResetToDefaults(ParamsVectors* member_params);
|
||||
};
|
||||
|
||||
// Definition of various parameter types.
|
||||
@ -142,15 +145,20 @@ class IntParam : public Param {
|
||||
IntParam(inT32 value, const char *name, const char *comment, bool init,
|
||||
ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->int_params);
|
||||
vec->int_params.push_back(this);
|
||||
}
|
||||
~IntParam() { ParamUtils::RemoveParam<IntParam>(this, params_vec_); }
|
||||
operator inT32() const { return value_; }
|
||||
void set_value(inT32 value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
inT32 value_;
|
||||
inT32 default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<IntParam *> *params_vec_;
|
||||
};
|
||||
@ -160,15 +168,20 @@ class BoolParam : public Param {
|
||||
BoolParam(bool value, const char *name, const char *comment, bool init,
|
||||
ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->bool_params);
|
||||
vec->bool_params.push_back(this);
|
||||
}
|
||||
~BoolParam() { ParamUtils::RemoveParam<BoolParam>(this, params_vec_); }
|
||||
operator BOOL8() const { return value_; }
|
||||
void set_value(BOOL8 value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
BOOL8 value_;
|
||||
BOOL8 default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<BoolParam *> *params_vec_;
|
||||
};
|
||||
@ -179,17 +192,23 @@ class StringParam : public Param {
|
||||
const char *comment, bool init,
|
||||
ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->string_params);
|
||||
vec->string_params.push_back(this);
|
||||
}
|
||||
~StringParam() { ParamUtils::RemoveParam<StringParam>(this, params_vec_); }
|
||||
operator STRING &() { return value_; }
|
||||
const char *string() const { return value_.string(); }
|
||||
const char *c_str() const { return value_.string(); }
|
||||
bool empty() { return value_.length() <= 0; }
|
||||
void set_value(const STRING &value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
STRING value_;
|
||||
STRING default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<StringParam *> *params_vec_;
|
||||
};
|
||||
@ -199,15 +218,20 @@ class DoubleParam : public Param {
|
||||
DoubleParam(double value, const char *name, const char *comment,
|
||||
bool init, ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->double_params);
|
||||
vec->double_params.push_back(this);
|
||||
}
|
||||
~DoubleParam() { ParamUtils::RemoveParam<DoubleParam>(this, params_vec_); }
|
||||
operator double() const { return value_; }
|
||||
void set_value(double value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
double value_;
|
||||
double default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<DoubleParam *> *params_vec_;
|
||||
};
|
||||
|
@ -20,16 +20,12 @@
|
||||
#ifndef TESSERACT_CCUTIL_PLATFORM_H__
|
||||
#define TESSERACT_CCUTIL_PLATFORM_H__
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#define DLLSYM
|
||||
#ifdef _WIN32
|
||||
#ifdef __GNUC__
|
||||
#define ultoa _ultoa
|
||||
#ifndef __MINGW32__
|
||||
typedef struct _BLOB {
|
||||
unsigned int cbSize;
|
||||
char *pBlobData;
|
||||
} BLOB, *LPBLOB;
|
||||
#endif /* __MINGW32__ */
|
||||
#endif /* __GNUC__ */
|
||||
#define SIGNED
|
||||
#define snprintf _snprintf
|
||||
@ -71,4 +67,12 @@ typedef struct _BLOB {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||
#define _TESS_FILE_BASENAME_ \
|
||||
(strrchr(__FILE__, '\\') ? strrchr(__FILE__, '\\') + 1 : __FILE__)
|
||||
#else // Unices
|
||||
#define _TESS_FILE_BASENAME_ \
|
||||
(strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
|
||||
#endif
|
||||
|
||||
#endif // TESSERACT_CCUTIL_PLATFORM_H__
|
||||
|
@ -56,7 +56,9 @@ class SortHelper {
|
||||
}
|
||||
|
||||
// Constructor takes a hint of the array size, but it need not be accurate.
|
||||
explicit SortHelper(int sizehint) : counts_(sizehint) {}
|
||||
explicit SortHelper(int sizehint) {
|
||||
counts_.reserve(sizehint);
|
||||
}
|
||||
|
||||
// Add a value that may be a duplicate of an existing value.
|
||||
// Uses a linear search.
|
||||
|
@ -24,8 +24,11 @@
|
||||
|
||||
#include <assert.h>
|
||||
// Size of buffer needed to host the decimal representation of the maximum
|
||||
// possible length of an int (in 64 bits, being -<20 digits>.
|
||||
// possible length of an int (in 64 bits), being -<20 digits>.
|
||||
const int kMaxIntSize = 22;
|
||||
// Size of buffer needed to host the decimal representation of the maximum
|
||||
// possible length of a %.8g being -0.12345678e+999<nul> = 15.
|
||||
const int kMaxDoubleSize = 15;
|
||||
|
||||
/**********************************************************************
|
||||
* STRING_HEADER provides metadata about the allocated buffer,
|
||||
@ -163,6 +166,10 @@ const char* STRING::string() const {
|
||||
return GetCStr();
|
||||
}
|
||||
|
||||
// Identical to string(); provided so STRING can be used where callers
// expect the std::string-style c_str() spelling.
const char* STRING::c_str() const {
  return string();
}
|
||||
|
||||
/******
|
||||
* The STRING_IS_PROTECTED interface adds additional support to migrate
|
||||
* code that needs to modify the STRING in ways not otherwise supported
|
||||
@ -220,6 +227,8 @@ void STRING::erase_range(inT32 index, int len) {
|
||||
|
||||
#else
|
||||
void STRING::truncate_at(inT32 index) {
|
||||
ASSERT_HOST(index >= 0);
|
||||
FixHeader();
|
||||
char* this_cstr = ensure_cstr(index + 1);
|
||||
this_cstr[index] = '\0';
|
||||
GetHeader()->used_ = index + 1;
|
||||
@ -339,6 +348,16 @@ void STRING::add_str_int(const char* str, int number) {
|
||||
num_buffer[kMaxIntSize - 1] = '\0';
|
||||
*this += num_buffer;
|
||||
}
|
||||
// Appends the given string and double (as a %.8g) to this.
|
||||
void STRING::add_str_double(const char* str, double number) {
|
||||
if (str != NULL)
|
||||
*this += str;
|
||||
// Allow space for the maximum possible length of %8g.
|
||||
char num_buffer[kMaxDoubleSize];
|
||||
snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
|
||||
num_buffer[kMaxDoubleSize - 1] = '\0';
|
||||
*this += num_buffer;
|
||||
}
|
||||
|
||||
STRING & STRING::operator=(const char* cstr) {
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
|
@ -55,6 +55,7 @@ class TESS_API STRING
|
||||
inT32 length() const;
|
||||
inT32 size() const { return length(); }
|
||||
const char *string() const;
|
||||
const char *c_str() const;
|
||||
|
||||
inline char* strdup() const {
|
||||
inT32 len = length() + 1;
|
||||
@ -94,8 +95,10 @@ class TESS_API STRING
|
||||
// be ambiguous, and ints usually need a string before or between them
|
||||
// anyway.
|
||||
void add_str_int(const char* str, int number);
|
||||
// Appends the given string and double (as a %.8g) to this.
|
||||
void add_str_double(const char* str, double number);
|
||||
|
||||
// ensure capcaity but keep pointer encapsulated
|
||||
// ensure capacity but keep pointer encapsulated
|
||||
inline void ensure(inT32 min_capacity) { ensure_cstr(min_capacity); }
|
||||
|
||||
private:
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -35,6 +35,7 @@ namespace tesseract {
|
||||
bool TessdataManager::Init(const char *data_file_name, int debug_level) {
|
||||
int i;
|
||||
debug_level_ = debug_level;
|
||||
data_file_name_ = data_file_name;
|
||||
data_file_ = fopen(data_file_name, "rb");
|
||||
if (data_file_ == NULL) {
|
||||
tprintf("Error opening data file %s\n", data_file_name);
|
||||
@ -244,7 +245,7 @@ bool TessdataManager::ExtractToFile(const char *filename) {
|
||||
|
||||
FILE *output_file = fopen(filename, "wb");
|
||||
if (output_file == NULL) {
|
||||
tprintf("Error openning %s\n", filename);
|
||||
tprintf("Error opening %s\n", filename);
|
||||
exit(1);
|
||||
}
|
||||
inT64 begin_offset = ftell(GetDataFilePtr());
|
||||
|
@ -21,7 +21,9 @@
|
||||
#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "host.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
static const char kTrainedDataSuffix[] = "traineddata";
|
||||
@ -44,7 +46,7 @@ static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
|
||||
static const char kShapeTableFileSuffix[] = "shapetable";
|
||||
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
|
||||
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
|
||||
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
|
||||
static const char kParamsModelFileSuffix[] = "params-model";
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -59,13 +61,13 @@ enum TessdataType {
|
||||
TESSDATA_SYSTEM_DAWG, // 7
|
||||
TESSDATA_NUMBER_DAWG, // 8
|
||||
TESSDATA_FREQ_DAWG, // 9
|
||||
TESSDATA_FIXED_LENGTH_DAWGS, // 10
|
||||
TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
|
||||
TESSDATA_CUBE_UNICHARSET, // 11
|
||||
TESSDATA_CUBE_SYSTEM_DAWG, // 12
|
||||
TESSDATA_SHAPE_TABLE, // 13
|
||||
TESSDATA_BIGRAM_DAWG, // 14
|
||||
TESSDATA_UNAMBIG_DAWG, // 15
|
||||
TESSDATA_PARAMS_TRAINING_MODEL, // 16
|
||||
TESSDATA_PARAMS_MODEL, // 16
|
||||
|
||||
TESSDATA_NUM_ENTRIES
|
||||
};
|
||||
@ -85,13 +87,13 @@ static const char * const kTessdataFileSuffixes[] = {
|
||||
kSystemDawgFileSuffix, // 7
|
||||
kNumberDawgFileSuffix, // 8
|
||||
kFreqDawgFileSuffix, // 9
|
||||
kFixedLengthDawgsFileSuffix, // 10
|
||||
kFixedLengthDawgsFileSuffix, // 10 // deprecated
|
||||
kCubeUnicharsetFileSuffix, // 11
|
||||
kCubeSystemDawgFileSuffix, // 12
|
||||
kShapeTableFileSuffix, // 13
|
||||
kBigramDawgFileSuffix, // 14
|
||||
kUnambigDawgFileSuffix, // 15
|
||||
kParamsTrainingModelFileSuffix, // 16
|
||||
kParamsModelFileSuffix, // 16
|
||||
};
|
||||
|
||||
/**
|
||||
@ -109,13 +111,13 @@ static const bool kTessdataFileIsText[] = {
|
||||
false, // 7
|
||||
false, // 8
|
||||
false, // 9
|
||||
false, // 10
|
||||
false, // 10 // deprecated
|
||||
true, // 11
|
||||
false, // 12
|
||||
false, // 13
|
||||
false, // 14
|
||||
false, // 15
|
||||
false, // 16
|
||||
true, // 16
|
||||
};
|
||||
|
||||
/**
|
||||
@ -146,6 +148,9 @@ class TessdataManager {
|
||||
*/
|
||||
bool Init(const char *data_file_name, int debug_level);
|
||||
|
||||
// Return the name of the underlying data file.
|
||||
const STRING &GetDataFileName() const { return data_file_name_; }
|
||||
|
||||
/** Returns data file pointer. */
|
||||
inline FILE *GetDataFilePtr() const { return data_file_; }
|
||||
|
||||
@ -279,6 +284,7 @@ class TessdataManager {
|
||||
* when new tessdata types are introduced.
|
||||
*/
|
||||
inT32 actual_tessdata_num_entries_;
|
||||
STRING data_file_name_; // name of the data file.
|
||||
FILE *data_file_; ///< pointer to the data file.
|
||||
int debug_level_;
|
||||
// True if the bytes need swapping.
|
||||
|
@ -24,43 +24,46 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include "strngs.h"
|
||||
#include "params.h"
|
||||
#include "tprintf.h"
|
||||
#include "ccutil.h"
|
||||
#include "params.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#define MAX_MSG_LEN 65536
|
||||
|
||||
#define EXTERN
|
||||
// Since tprintf is protected by a mutex, these parameters can rmain global.
|
||||
// Since tprintf is protected by a mutex, these parameters can remain global.
|
||||
DLLSYM STRING_VAR(debug_file, "", "File to send tprintf output to");
|
||||
|
||||
DLLSYM INT_VAR(FLAGS_v, 0, "Minimum logging level for tlog() output");
|
||||
|
||||
DLLSYM void
|
||||
tprintf( // Trace printf
|
||||
const char *format, ... // special message
|
||||
tprintf_internal( // Trace printf
|
||||
const int level, // Logging level
|
||||
const char *format, ... // Message
|
||||
) {
|
||||
if (FLAGS_v < level) return;
|
||||
tesseract::tprintfMutex.Lock();
|
||||
va_list args; //variable args
|
||||
static FILE *debugfp = NULL; //debug file
|
||||
//debug window
|
||||
inT32 offset = 0; //into message
|
||||
va_list args; // variable args
|
||||
static FILE *debugfp = NULL; // debug file
|
||||
// debug window
|
||||
inT32 offset = 0; // into message
|
||||
static char msg[MAX_MSG_LEN + 1];
|
||||
|
||||
va_start(args, format); //variable list
|
||||
va_start(args, format); // variable list
|
||||
// Format into msg
|
||||
#ifdef _WIN32
|
||||
//Format into msg
|
||||
offset += _vsnprintf (msg + offset, MAX_MSG_LEN - offset, format, args);
|
||||
offset += _vsnprintf(msg + offset, MAX_MSG_LEN - offset, format, args);
|
||||
if (strcmp(debug_file.string(), "/dev/null") == 0)
|
||||
debug_file.set_value("nul");
|
||||
#else
|
||||
//Format into msg
|
||||
offset += vsprintf (msg + offset, format, args);
|
||||
offset += vsnprintf(msg + offset, MAX_MSG_LEN - offset, format, args);
|
||||
#endif
|
||||
va_end(args);
|
||||
|
||||
if (debugfp == NULL && strlen (debug_file.string ()) > 0) {
|
||||
debugfp = fopen (debug_file.string (), "wb");
|
||||
} else if (debugfp != NULL && strlen (debug_file.string ()) == 0) {
|
||||
if (debugfp == NULL && strlen(debug_file.string()) > 0) {
|
||||
debugfp = fopen(debug_file.string(), "wb");
|
||||
} else if (debugfp != NULL && strlen(debug_file.string()) == 0) {
|
||||
fclose(debugfp);
|
||||
debugfp = NULL;
|
||||
}
|
||||
@ -70,46 +73,3 @@ const char *format, ... // special message
|
||||
fprintf(stderr, "%s", msg);
|
||||
tesseract::tprintfMutex.Unlock();
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* pause_continue()
|
||||
* UI for a debugging pause - to see an intermediate state
|
||||
* Returns TRUE to continue as normal to the next pause in the current mode;
|
||||
* FALSE to quit the current pausing mode.
|
||||
*************************************************************************/
|
||||
|
||||
DLLSYM BOOL8
|
||||
//special message
|
||||
pause_continue (const char *format, ...
|
||||
) {
|
||||
va_list args; //variable args
|
||||
char msg[1000];
|
||||
STRING str = STRING ("DEBUG PAUSE:\n");
|
||||
|
||||
va_start(args, format); //variable list
|
||||
vsprintf(msg, format, args); //Format into msg
|
||||
va_end(args);
|
||||
|
||||
#ifdef GRAPHICS_DISABLED
|
||||
// No interaction allowed -> simply go on
|
||||
return true;
|
||||
#else
|
||||
|
||||
#ifdef __UNIX__
|
||||
printf ("%s\n", msg);
|
||||
printf ("Type \"c\" to cancel, anything else to continue: ");
|
||||
char c = getchar ();
|
||||
return (c != 'c');
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
str +=
|
||||
STRING (msg) + STRING ("\nUse OK to continue, CANCEL to stop pausing");
|
||||
// return AfxMessageBox( str.string(), MB_OKCANCEL ) == IDOK;
|
||||
return::MessageBox (NULL, msg, "IMGAPP",
|
||||
MB_APPLMODAL | MB_OKCANCEL) == IDOK;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
@ -17,19 +17,29 @@
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TPRINTF_H
|
||||
#define TPRINTF_H
|
||||
#ifndef TESSERACT_CCUTIL_TPRINTF_H
|
||||
#define TESSERACT_CCUTIL_TPRINTF_H
|
||||
|
||||
#include "params.h"
|
||||
#include "params.h"
|
||||
|
||||
extern DLLSYM STRING_VAR_H (debug_file, "", "File to send tprintf output to");
|
||||
extern DLLSYM BOOL_VAR_H (debug_window_on, TRUE,
|
||||
"Send tprintf to window unless file set");
|
||||
extern DLLSYM STRING_VAR_H(debug_file, "",
|
||||
"File to send tprintf output to");
|
||||
extern DLLSYM BOOL_VAR_H(debug_window_on, TRUE,
|
||||
"Send tprintf to window unless file set");
|
||||
|
||||
extern TESS_API void tprintf( // Trace printf
|
||||
const char *format, ... // special message
|
||||
);
|
||||
// special message
|
||||
DLLSYM BOOL8 pause_continue (const char *format, ...
|
||||
);
|
||||
#endif
|
||||
// Main logging function.
|
||||
#define tprintf(args...) tprintf_internal(0, args)
|
||||
|
||||
// Variant guarded by the numeric logging level parameter FLAGS_v (default 0).
|
||||
// Code using ParseCommandLineFlags() can control its value using the --v
|
||||
// commandline argument. Otherwise it must be specified in a config file like
|
||||
// other params.
|
||||
#define tlog(level, args...) tprintf_internal(level, args)
|
||||
|
||||
#define TLOG_IS_ON(level) (FLAGS_v >= level)
|
||||
|
||||
extern TESS_API void tprintf_internal( // Trace printf
|
||||
const int level, // Logging level
|
||||
const char *format, ...); // Message
|
||||
|
||||
#endif // define TESSERACT_CCUTIL_TPRINTF_H
|
||||
|
21370
ccutil/universalambigs.cpp
Normal file
21370
ccutil/universalambigs.cpp
Normal file
File diff suppressed because it is too large
Load Diff
26
ccutil/universalambigs.h
Normal file
26
ccutil/universalambigs.h
Normal file
@ -0,0 +1,26 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: universalambigs.h
|
||||
// Description: Data for a universal ambigs file that is useful for
|
||||
// any language.
|
||||
// Author: Ray Smith
|
||||
// Created: Mon Mar 18 11:26:00 PDT 2013
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
extern const char kUniversalAmbigsFile[];
|
||||
extern const int ksizeofUniversalAmbigsFile;
|
||||
|
||||
} // namespace tesseract
|
10
configure.ac
10
configure.ac
@ -7,7 +7,7 @@
|
||||
# ----------------------------------------
|
||||
|
||||
AC_PREREQ(2.50)
|
||||
AC_INIT([tesseract], [3.02.03], [http://code.google.com/p/tesseract-ocr/issues/list])
|
||||
AC_INIT([tesseract], [3.03], [http://code.google.com/p/tesseract-ocr/issues/list])
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_REVISION($Id: configure.ac,v 1.4 2007/02/02 22:38:17 theraysmith Exp $)
|
||||
AC_CONFIG_AUX_DIR(config)
|
||||
@ -18,7 +18,7 @@ AC_PREFIX_DEFAULT(/usr/local)
|
||||
# documentation.
|
||||
# TODO(luc) Generate good documentation using doxygen or equivalent
|
||||
PACKAGE_YEAR=2013
|
||||
PACKAGE_DATE="07/03"
|
||||
PACKAGE_DATE="08/13"
|
||||
|
||||
AC_DEFINE_UNQUOTED(PACKAGE_NAME,["${PACKAGE_NAME}"],[Name of package])
|
||||
AC_DEFINE_UNQUOTED(PACKAGE_VERSION,["${PACKAGE_VERSION}"],[Version number])
|
||||
@ -34,8 +34,8 @@ GENERIC_LIBRARY_NAME=tesseract
|
||||
|
||||
# Release versioning
|
||||
GENERIC_MAJOR_VERSION=3
|
||||
GENERIC_MINOR_VERSION=2
|
||||
GENERIC_MICRO_VERSION=3
|
||||
GENERIC_MINOR_VERSION=3
|
||||
GENERIC_MICRO_VERSION=0
|
||||
|
||||
# API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)
|
||||
GENERIC_API_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION
|
||||
@ -248,6 +248,7 @@ AC_HEADER_TIME
|
||||
AC_HEADER_SYS_WAIT
|
||||
AC_CHECK_HEADERS(sys/ipc.h sys/shm.h)
|
||||
AC_CHECK_HEADERS(limits.h malloc.h)
|
||||
AC_CHECK_HEADERS(allheaders.h)
|
||||
# Enable use of system-defined bool type if available:
|
||||
AC_HEADER_STDBOOL
|
||||
|
||||
@ -261,6 +262,7 @@ AC_SYS_LARGEFILE
|
||||
# ----------------------------------------
|
||||
|
||||
AC_CHECK_TYPES(wchar_t)
|
||||
AC_CHECK_TYPES(long long int)
|
||||
AC_CHECK_TYPES(mbstate_t,,,[#include "wchar.h"])
|
||||
|
||||
# ----------------------------------------
|
||||
|
@ -65,13 +65,13 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
|
||||
!tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) {
|
||||
fprintf(stderr, "Cube ERROR (CharSet::Create): could not find "
|
||||
"either cube or tesseract unicharset\n");
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
FILE *charset_fp = tessdata_manager->GetDataFilePtr();
|
||||
if (!charset_fp) {
|
||||
fprintf(stderr, "Cube ERROR (CharSet::Create): could not load "
|
||||
"a unicharset\n");
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// If we found a cube unicharset separate from tesseract's, load it and
|
||||
@ -90,7 +90,7 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
|
||||
}
|
||||
if (!loaded) {
|
||||
delete char_set;
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char_set->init_ = true;
|
||||
|
@ -234,8 +234,8 @@ bool ConvNetCharClassifier::LoadFoldingSets(const string &data_file_path,
|
||||
fclose(fp);
|
||||
|
||||
string fold_sets_str;
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name.c_str(),
|
||||
&fold_sets_str)) {
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name,
|
||||
&fold_sets_str)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -327,7 +327,7 @@ bool ConvNetCharClassifier::LoadNets(const string &data_file_path,
|
||||
fclose(fp);
|
||||
|
||||
// load main net
|
||||
char_net_ = tesseract::NeuralNet::FromFile(char_net_file.c_str());
|
||||
char_net_ = tesseract::NeuralNet::FromFile(char_net_file);
|
||||
if (char_net_ == NULL) {
|
||||
fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadNets): "
|
||||
"could not load %s\n", char_net_file.c_str());
|
||||
|
@ -124,7 +124,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
|
||||
|
||||
if (line_con_comps == NULL) {
|
||||
delete []lines_pixa;
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// assign each conn comp to the a line based on its centroid
|
||||
@ -142,7 +142,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
|
||||
delete []lines_pixa;
|
||||
boxaDestroy(&line_con_comps);
|
||||
pixaDestroy(&line_con_comps_pix);
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@ -413,14 +413,14 @@ Pix *CubeLineSegmenter::Pixa2Pix(Pixa *pixa, Box **dest_box,
|
||||
|
||||
(*dest_box) = boxCreate(min_x, min_y, max_x - min_x, max_y - min_y);
|
||||
if ((*dest_box) == NULL) {
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// create the union pix
|
||||
Pix *union_pix = pixCreate((*dest_box)->w, (*dest_box)->h, img_->d);
|
||||
if (union_pix == NULL) {
|
||||
boxDestroy(dest_box);
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// create a pix corresponding to the union of all pixs
|
||||
|
@ -165,7 +165,7 @@ WordAltList *CubeObject::Recognize(LangModel *lang_mod, bool word_mode) {
|
||||
if (deslanted_beam_obj_ == NULL) {
|
||||
fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not "
|
||||
"construct deslanted BeamSearch\n");
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -230,8 +230,8 @@ bool HybridNeuralNetCharClassifier::LoadFoldingSets(
|
||||
fclose(fp);
|
||||
|
||||
string fold_sets_str;
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name.c_str(),
|
||||
&fold_sets_str)) {
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name,
|
||||
&fold_sets_str)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -323,7 +323,7 @@ bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path,
|
||||
fclose(fp);
|
||||
|
||||
string str;
|
||||
if (!CubeUtils::ReadFileToString(hybrid_net_file.c_str(), &str)) {
|
||||
if (!CubeUtils::ReadFileToString(hybrid_net_file, &str)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -348,7 +348,7 @@ bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path,
|
||||
}
|
||||
// load the net
|
||||
string net_file_name = data_file_path + tokens_vec[0];
|
||||
nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name.c_str());
|
||||
nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name);
|
||||
if (nets_[net_idx] == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
@ -107,7 +107,7 @@ int TessLangModEdge::CreateChildren(CubeRecoContext *cntxt,
|
||||
LangModEdge **edge_array) {
|
||||
int edge_cnt = 0;
|
||||
NodeChildVector vec;
|
||||
dawg->unichar_ids_of(parent_node, &vec); // find all children of the parent
|
||||
dawg->unichar_ids_of(parent_node, &vec, false); // find all children
|
||||
for (int i = 0; i < vec.size(); ++i) {
|
||||
const NodeChild &child = vec[i];
|
||||
if (child.unichar_id == INVALID_UNICHAR_ID) continue;
|
||||
|
@ -74,7 +74,7 @@ LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
|
||||
// initialize if necessary
|
||||
if (init_ == false) {
|
||||
if (Init() == false) {
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@ -92,7 +92,7 @@ LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
|
||||
// advance node
|
||||
edge_ref = dawg_->next_node(edge_ref);
|
||||
if (edge_ref == 0) {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,7 +8,7 @@ endif
|
||||
noinst_HEADERS = \
|
||||
bitvec.h callcpp.h const.h cutil.h cutil_class.h danerror.h efio.h \
|
||||
emalloc.h freelist.h globals.h listio.h \
|
||||
oldheap.h oldlist.h structures.h tessarray.h
|
||||
oldlist.h structures.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_cutil.la
|
||||
@ -22,7 +22,7 @@ endif
|
||||
|
||||
libtesseract_cutil_la_SOURCES = \
|
||||
bitvec.cpp callcpp.cpp cutil.cpp cutil_class.cpp danerror.cpp efio.cpp \
|
||||
emalloc.cpp freelist.cpp listio.cpp oldheap.cpp \
|
||||
oldlist.cpp structures.cpp tessarray.cpp
|
||||
emalloc.cpp freelist.cpp listio.cpp \
|
||||
oldlist.cpp structures.cpp
|
||||
|
||||
|
||||
|
@ -73,27 +73,6 @@ void FreeBitVector(BIT_VECTOR BitVector) {
|
||||
} /* FreeBitVector */
|
||||
|
||||
|
||||
/**
|
||||
* hamming_distance(array1,array2,length) computes the hamming distance
|
||||
* between two bit strings.
|
||||
*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
int hamming_distance(uinT32* array1, uinT32* array2, int length) {
|
||||
register uinT32 diff; /*bit difference */
|
||||
register int dist; /*total distance */
|
||||
|
||||
dist = 0;
|
||||
for (; length > 0; length--) {
|
||||
diff = *array1++ ^ *array2++;/*different bits */
|
||||
while (diff) {
|
||||
diff &= diff - 1; /*lose a bit */
|
||||
dist++;
|
||||
}
|
||||
}
|
||||
return dist; /*total distance */
|
||||
}
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* Allocate and return a new bit vector large enough to
|
||||
|
@ -70,8 +70,6 @@ BIT_VECTOR ExpandBitVector(BIT_VECTOR Vector, int NewNumBits);
|
||||
|
||||
void FreeBitVector(BIT_VECTOR BitVector);
|
||||
|
||||
int hamming_distance(uinT32* array1, uinT32* array2, int length);
|
||||
|
||||
BIT_VECTOR NewBitVector(int NumBits);
|
||||
|
||||
#endif
|
||||
|
@ -53,5 +53,5 @@ void DoError(int Error, const char *Message) {
|
||||
tprintf("\nError: %s!\n", Message);
|
||||
}
|
||||
|
||||
signal_termination_handler(Error);
|
||||
err_exit();
|
||||
} /* DoError */
|
||||
|
@ -46,7 +46,6 @@ LIST read_list(const char *filename) {
|
||||
FILE *infile;
|
||||
char s[CHARS_PER_LINE];
|
||||
LIST list;
|
||||
char *chopAt250();
|
||||
|
||||
if ((infile = open_file (filename, "r")) == NULL)
|
||||
return (NIL_LIST);
|
||||
|
@ -1,334 +0,0 @@
|
||||
/******************************************************************************
|
||||
** Filename: heap.c
|
||||
** Purpose: Routines for managing heaps (smallest at root)
|
||||
** Author: Dan Johnson
|
||||
** History: 3/13/89, DSJ, Created.
|
||||
**
|
||||
** (c) Copyright Hewlett-Packard Company, 1988.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
******************************************************************************/
|
||||
/*-----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
-----------------------------------------------------------------------------*/
|
||||
#include "oldheap.h"
|
||||
#include "freelist.h"
|
||||
#include "danerror.h"
|
||||
#include "emalloc.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#define FATHER(N) ((N)>>1)
|
||||
#define LEFTSON(N) ((N)<<1)
|
||||
#define RIGHTSON(N) ((N)<<1 + 1)
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Public Code
|
||||
-----------------------------------------------------------------------------*/
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine creates and initializes a new heap data
|
||||
* structure containing Size elements. In actuality, Size + 1
|
||||
* elements are allocated. The first element, element 0, is
|
||||
* unused, this makes the index arithmetic easier.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Size maximum number of entries in the heap
|
||||
* @return Pointer to the new heap.
|
||||
* @note Exceptions: None
|
||||
* @note History: 3/13/89, DSJ, Created.
|
||||
*/
|
||||
HEAP *MakeHeap(int Size) {
|
||||
HEAP *NewHeap;
|
||||
|
||||
NewHeap = (HEAP *) Emalloc (sizeof (HEAP) + Size * sizeof (HEAPENTRY));
|
||||
|
||||
NewHeap->Size = Size;
|
||||
NewHeap->FirstFree = 1;
|
||||
return (NewHeap);
|
||||
} /* MakeHeap */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine removes the top item on the heap and places
|
||||
* its contents into Key and Data.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Heap ptr to heap whose top is to be removed and returned
|
||||
* @param Key place to put key of top heap item
|
||||
* @param out_ptr place to put data of top heap item
|
||||
*
|
||||
* @return OK if top entry returned, EMPTY if heap is empty
|
||||
* @note Exceptions: None
|
||||
* @note History: 5/10/91, DSJ, Created (Modified from GetTopOfHeap).
|
||||
*/
|
||||
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr) {
|
||||
inT32 Hole;
|
||||
FLOAT32 HoleKey;
|
||||
inT32 Son;
|
||||
void **Data = (void **) out_ptr;
|
||||
|
||||
if (Heap->FirstFree <= 1)
|
||||
return (EMPTY);
|
||||
|
||||
*Key = Heap->Entry[1].Key;
|
||||
*Data = Heap->Entry[1].Data;
|
||||
|
||||
Heap->FirstFree--;
|
||||
|
||||
/* imagine the hole at the root is filled with the last entry in the heap */
|
||||
HoleKey = Heap->Entry[Heap->FirstFree].Key;
|
||||
Hole = 1;
|
||||
|
||||
/* while hole has 2 sons */
|
||||
while ((Son = LEFTSON (Hole)) < Heap->FirstFree) {
|
||||
/* find the son with the smallest key */
|
||||
if (Heap->Entry[Son].Key > Heap->Entry[Son + 1].Key)
|
||||
Son++;
|
||||
|
||||
/* if key for hole is greater than key for son, sift hole down */
|
||||
if (HoleKey > Heap->Entry[Son].Key) {
|
||||
Heap->Entry[Hole].Key = Heap->Entry[Son].Key;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Son].Data;
|
||||
Hole = Son;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Hole].Key = HoleKey;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Heap->FirstFree].Data;
|
||||
return (TESS_HEAP_OK);
|
||||
} /* HeapPop */
|
||||
|
||||
|
||||
/**
|
||||
* HeapPopWorst
|
||||
*
|
||||
* Remove the largest item from the heap.
|
||||
*
|
||||
* @param Heap ptr to heap whose top is to be removed and returned
|
||||
* @param Key place to put key of top heap item
|
||||
* @param out_ptr place to put data of top heap item
|
||||
*/
|
||||
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr) {
|
||||
inT32 Index; /*current index */
|
||||
inT32 Hole;
|
||||
FLOAT32 HoleKey;
|
||||
inT32 Father;
|
||||
void *HoleData;
|
||||
void **Data = (void **) out_ptr;
|
||||
|
||||
if (Heap->FirstFree <= 1)
|
||||
return (EMPTY);
|
||||
|
||||
HoleKey = Heap->Entry[1].Key;
|
||||
Hole = 1;
|
||||
Heap->FirstFree--;
|
||||
for (Index = Heap->FirstFree, Father = FATHER (Index); Index > Father;
|
||||
Index--)
|
||||
if (Heap->Entry[Index].Key > HoleKey) {
|
||||
/*find biggest */
|
||||
HoleKey = Heap->Entry[Index].Key;
|
||||
Hole = Index;
|
||||
}
|
||||
*Key = HoleKey;
|
||||
*Data = Heap->Entry[Hole].Data;
|
||||
|
||||
HoleKey = Heap->Entry[Heap->FirstFree].Key;
|
||||
Heap->Entry[Hole].Key = HoleKey;
|
||||
HoleData = Heap->Entry[Heap->FirstFree].Data;
|
||||
Heap->Entry[Hole].Data = HoleData;
|
||||
|
||||
/* now sift last entry to its rightful place */
|
||||
Father = FATHER (Hole); /*father of hole */
|
||||
while (Hole > 1 && Heap->Entry[Father].Key > HoleKey) {
|
||||
/*swap entries */
|
||||
Heap->Entry[Hole].Key = Heap->Entry[Father].Key;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Father].Data;
|
||||
Heap->Entry[Father].Data = HoleData;
|
||||
Heap->Entry[Father].Key = HoleKey;
|
||||
Hole = Father;
|
||||
Father = FATHER (Hole);
|
||||
}
|
||||
return (TESS_HEAP_OK);
|
||||
} /* HeapPop */
|
||||
|
||||
|
||||
// Pushes data onto the heap only if there is free space left.
|
||||
// Returns true if data was added to the heap, false if the heap was full.
|
||||
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data) {
|
||||
if (Heap->FirstFree > Heap->Size) return false;
|
||||
HeapPush(Heap, Key, Data);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine stores Data into Heap and associates it
|
||||
* with Key. The heap is
|
||||
* maintained in such a way that the item with the lowest key
|
||||
* is always at the top of the heap.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Heap ptr to heap to store new item in
|
||||
* @param Key numeric key associated with new item
|
||||
* @param Data ptr to data contents of new item
|
||||
*
|
||||
* @note Exceptions:
|
||||
* - HEAPFULL error if heap size is exceeded
|
||||
*
|
||||
* @note History: 5/10/91, DSJ, Created (Modified version of HeapStore).
|
||||
*/
|
||||
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data) {
|
||||
inT32 Item;
|
||||
inT32 Father;
|
||||
|
||||
if (Heap->FirstFree > Heap->Size)
|
||||
DoError (HEAPFULL, "Heap size exceeded");
|
||||
|
||||
Item = Heap->FirstFree;
|
||||
Heap->FirstFree++;
|
||||
while (Item != 1) {
|
||||
Father = FATHER (Item);
|
||||
if (Heap->Entry[Father].Key > Key) {
|
||||
Heap->Entry[Item].Key = Heap->Entry[Father].Key;
|
||||
Heap->Entry[Item].Data = Heap->Entry[Father].Data;
|
||||
Item = Father;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Item].Key = Key;
|
||||
Heap->Entry[Item].Data = Data;
|
||||
} /* HeapPush */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine stores Entry into Heap. The heap is
|
||||
* maintained in such a way that the item with the lowest key
|
||||
* is always at the top of the heap.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Heap ptr to heap to store new item in
|
||||
* @param Entry ptr to item to be stored in Heap
|
||||
* @note Exceptions:
|
||||
* - HEAPFULL error if heap size is exceeded
|
||||
* @note History: 3/13/89, DSJ, Created.
|
||||
*/
|
||||
void HeapStore(HEAP *Heap, HEAPENTRY *Entry) {
|
||||
inT32 Item;
|
||||
inT32 Father;
|
||||
|
||||
if (Heap->FirstFree > Heap->Size)
|
||||
DoError (HEAPFULL, "Heap size exceeded");
|
||||
|
||||
Item = Heap->FirstFree;
|
||||
Heap->FirstFree++;
|
||||
while (Item != 1) {
|
||||
Father = FATHER (Item);
|
||||
if (Heap->Entry[Father].Key > Entry->Key) {
|
||||
Heap->Entry[Item].Key = Heap->Entry[Father].Key;
|
||||
Heap->Entry[Item].Data = Heap->Entry[Father].Data;
|
||||
Item = Father;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Item].Key = Entry->Key;
|
||||
Heap->Entry[Item].Data = Entry->Data;
|
||||
} /* HeapStore */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine removes the top item on the heap and copies its
|
||||
* contents into Entry.
|
||||
*
|
||||
* @param Heap ptr to heap whose top is to be removed and returned
|
||||
* @param Entry ptr to heap entry to be filled with top entry on Heap
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @return OK if top entry returned, EMPTY if heap is empty
|
||||
* @note Exceptions: None
|
||||
* @note History: 3/13/89, DSJ, Created.
|
||||
*/
|
||||
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry) {
|
||||
inT32 Hole;
|
||||
FLOAT32 HoleKey;
|
||||
inT32 Son;
|
||||
|
||||
if (Heap->FirstFree <= 1)
|
||||
return (EMPTY);
|
||||
|
||||
Entry->Key = Heap->Entry[1].Key;
|
||||
Entry->Data = Heap->Entry[1].Data;
|
||||
|
||||
Heap->FirstFree--;
|
||||
|
||||
/* imagine the hole at the root is filled with the last entry in the heap */
|
||||
HoleKey = Heap->Entry[Heap->FirstFree].Key;
|
||||
Hole = 1;
|
||||
|
||||
/* while hole has 2 sons */
|
||||
while ((Son = LEFTSON (Hole)) < Heap->FirstFree) {
|
||||
/* find the son with the smallest key */
|
||||
if (Heap->Entry[Son].Key > Heap->Entry[Son + 1].Key)
|
||||
Son++;
|
||||
|
||||
/* if key for hole is greater than key for son, sift hole down */
|
||||
if (HoleKey > Heap->Entry[Son].Key) {
|
||||
Heap->Entry[Hole].Key = Heap->Entry[Son].Key;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Son].Data;
|
||||
Hole = Son;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Hole].Key = HoleKey;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Heap->FirstFree].Data;
|
||||
return (TESS_HEAP_OK);
|
||||
} /* GetTopOfHeap */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine is similar to FreeHeap in that it
|
||||
* deallocates the memory consumed by the heap. However, it
|
||||
* also calls Deallocator for each item in the heap so that
|
||||
* this data is also deallocated.
|
||||
*
|
||||
* @param Heap heap whose data is to be freed
|
||||
* @param destructor function to be used to deallocate data
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @note Exceptions: none
|
||||
* @note History: Tue May 15 08:52:04 1990, DSJ, Created.
|
||||
*/
|
||||
void FreeHeapData(HEAP *Heap, void_dest destructor) {
|
||||
HEAPENTRY Entry;
|
||||
|
||||
while (GetTopOfHeap (Heap, &Entry) != EMPTY)
|
||||
destructor (Entry.Data);
|
||||
|
||||
FreeHeap(Heap);
|
||||
} /* FreeHeapData */
|
@ -1,80 +0,0 @@
|
||||
/******************************************************************************
|
||||
** Filename: heap.h
|
||||
** Purpose: Definition of heap access routines.
|
||||
** Author: Dan Johnson
|
||||
** History: 3/13/89, DSJ, Created.
|
||||
**
|
||||
** (c) Copyright Hewlett-Packard Company, 1988.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
******************************************************************************/
|
||||
#ifndef HEAP_H
|
||||
#define HEAP_H
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
-----------------------------------------------------------------------------*/
|
||||
#include "host.h"
|
||||
#include "cutil.h"
|
||||
|
||||
#define HEAPFULL 3000
|
||||
|
||||
#define EMPTY -1
|
||||
#define TESS_HEAP_OK 0
|
||||
|
||||
struct HEAPENTRY {
|
||||
FLOAT32 Key;
|
||||
void *Data;
|
||||
};
|
||||
|
||||
struct HEAP {
|
||||
inT32 Size;
|
||||
inT32 FirstFree;
|
||||
HEAPENTRY Entry[1];
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Macros
|
||||
-----------------------------------------------------------------------------*/
|
||||
#define FreeHeap(H) memfree(H)
|
||||
#define MaxSizeOfHeap(H) (H->Size)
|
||||
#define SizeOfHeap(H) (H->FirstFree - 1)
|
||||
#define InitHeap(H) (H->FirstFree = 1)
|
||||
#define HeapFull(H) ((H)->FirstFree > (H)->Size)
|
||||
#define HeapEmpty(H) ((H)->FirstFree <= 1)
|
||||
|
||||
/* macros for accessing elements in heap by index. The indicies vary from
|
||||
0 to SizeOfHeap-1. No bounds checking is done. Elements accessed in
|
||||
this manner are in random order relative to the Key values. These
|
||||
macros should never be used as the LHS of an assignment statement as this
|
||||
will corrupt the heap.*/
|
||||
#define HeapKeyFor(H,E) ((H)->Entry[(E)+1].Key)
|
||||
#define HeapDataFor(H,E) ((H)->Entry[(E)+1].Data)
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
-----------------------------------------------------------------------------*/
|
||||
HEAP *MakeHeap(int Size);
|
||||
|
||||
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr);
|
||||
|
||||
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr);
|
||||
|
||||
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data);
|
||||
|
||||
void HeapStore(HEAP *Heap, HEAPENTRY *Entry);
|
||||
|
||||
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry);
|
||||
|
||||
void FreeHeapData(HEAP *Heap, void_dest destructor);
|
||||
|
||||
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data);
|
||||
|
||||
#endif
|
@ -1,115 +0,0 @@
|
||||
/* -*-C-*-
|
||||
################################################################################
|
||||
#
|
||||
# File: array.c
|
||||
# Description: Dynamic Array of Strings
|
||||
# Author: Mark Seaman, Software Productivity
|
||||
# Created: Thu Jul 23 13:24:09 1987
|
||||
# Modified: Wed Mar 6 15:18:33 1991 (Mark Seaman) marks@hpgrlt
|
||||
# Language: C
|
||||
# Package: N/A
|
||||
# Status: Reusable Software Component
|
||||
#
|
||||
# (c) Copyright 1987, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
This file contains the implentations of a set of dynamic array of string
|
||||
manipulation routines. For the interface definitions and documentation
|
||||
of these routines see the file "das.h".
|
||||
|
||||
***************************************************************************/
|
||||
|
||||
#include "tessarray.h"
|
||||
#include "callcpp.h"
|
||||
#include "freelist.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#ifdef _WIN32
|
||||
#include <process.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
#if MAC_OR_DOS
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
/**********************************************************************
|
||||
* array_insert
|
||||
*
|
||||
* Insert a data element into a particular spot in the array. Move all
|
||||
* the elements in the array (past that spot) down one to make room for
|
||||
* the new element.
|
||||
**********************************************************************/
|
||||
ARRAY array_insert(ARRAY array, int index, void *value) {
|
||||
int x;
|
||||
|
||||
array = array_push (array, NULL);
|
||||
for (x = array_count (array) - 1; x > index; x--)
|
||||
array_value (array, x) = array_value (array, x - 1);
|
||||
array_value (array, index) = value;
|
||||
return (array);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* array_new
|
||||
*
|
||||
* Create a new array with a certain number of elements. If the number
|
||||
* of elements requested is 0 then the default number will be used.
|
||||
**********************************************************************/
|
||||
ARRAY array_new(int num) {
|
||||
ARRAY temp;
|
||||
int x;
|
||||
|
||||
if (num == 0)
|
||||
num = DEFAULT_SIZE;
|
||||
temp = (ARRAY) memalloc ((num - 2) * sizeof (char *) +
|
||||
sizeof (struct array_record));
|
||||
if (!temp) {
|
||||
cprintf ("error: Out of memory in array_new\n");
|
||||
exit (1); //?err_exit ();
|
||||
}
|
||||
array_count (temp) = 0;
|
||||
array_limit (temp) = num;
|
||||
for (x = 0; x < num; x++)
|
||||
array_value (temp, x) = (char *) 0;
|
||||
return (temp);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* array_push
|
||||
*
|
||||
* Add a new element onto the top of the array. If there is not room
|
||||
* more room is made by "realloc"ing the array. This means that the
|
||||
* new array location may change. All previous references to its old
|
||||
* location may no longer be valid.
|
||||
**********************************************************************/
|
||||
ARRAY array_push(ARRAY array, void *value) {
|
||||
if (array_count (array) == array_limit (array)) {
|
||||
array = (ARRAY) memrealloc (array, (array_limit (array) * 2 - 2) *
|
||||
sizeof (char *) +
|
||||
sizeof (struct array_record),
|
||||
(array_limit (array) -
|
||||
2) * sizeof (char *) +
|
||||
sizeof (struct array_record));
|
||||
if (!array) {
|
||||
cprintf ("error: Out of memory in array_push\n");
|
||||
exit (1); //?err_exit ();
|
||||
}
|
||||
array_limit (array) *= 2;
|
||||
}
|
||||
array_count (array)++;
|
||||
array_top (array) = value;
|
||||
return (array);
|
||||
}
|
@ -1,166 +0,0 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
*
|
||||
* File: array.h (Formerly array.h)
|
||||
* Description: Dynamic Array of String
|
||||
* Author: Mark Seaman, SW Productivity
|
||||
* Created: Fri Oct 16 14:37:00 1987
|
||||
* Modified: Mon Sep 24 14:15:59 1990 (Mark Seaman) marks@hpgrlt
|
||||
* Language: C
|
||||
* Package: N/A
|
||||
* Status: Reusable Software Component
|
||||
*
|
||||
* (c) Copyright 1987, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
|
||||
This file contains a set of general purpose dynamic array of string routines.
|
||||
These routines can be used in a wide variety of ways to provide several
|
||||
different popular data structures. A new "das" can be created by declaring
|
||||
a variable of type 'DAS'
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef TESSARRAY_H
|
||||
#define TESSARRAY_H
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------
|
||||
I n c l u d e s
|
||||
----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------
|
||||
T y p e s
|
||||
----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
typedef struct array_record
|
||||
{
|
||||
size_t limit;
|
||||
size_t top;
|
||||
void *base[2];
|
||||
} *ARRAY;
|
||||
|
||||
typedef void (*voidProc) ();
|
||||
|
||||
typedef int (*intProc) ();
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------
|
||||
M a c r o s
|
||||
----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#define DEFAULT_SIZE 2
|
||||
|
||||
/**********************************************************************
|
||||
* array_count
|
||||
*
|
||||
* Return the value of the number of elements currently in the array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_count(a) \
|
||||
((a)->top)
|
||||
|
||||
/**********************************************************************
|
||||
* array_free
|
||||
*
|
||||
* Free the memory allocated to this array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_free \
|
||||
memfree
|
||||
|
||||
/**********************************************************************
|
||||
* array_index
|
||||
*
|
||||
* Check to make sure that the index value is valid. Return the
|
||||
* value of the nth element currently in the array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_index(a,i) \
|
||||
((i<array_count(a)) ? (a)->base[i] : 0)
|
||||
|
||||
/**********************************************************************
|
||||
* array_limit
|
||||
*
|
||||
* Return the maximum number of elements that could be currently held
|
||||
* in this array without further expansion.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_limit(a) \
|
||||
((a)->limit)
|
||||
|
||||
/**********************************************************************
|
||||
* array_loop
|
||||
*
|
||||
* Iterate through each of the array elements. Each value can then be
|
||||
* accessed by:
|
||||
* array_index (a, x)
|
||||
**********************************************************************/
|
||||
|
||||
#define array_loop(a,x) \
|
||||
for (x=0; x < array_count (a); x++)
|
||||
|
||||
/**********************************************************************
|
||||
* array_top
|
||||
*
|
||||
* Return the last element that was pushed on this array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_top(a) \
|
||||
((a)->base[array_count (a) - 1])
|
||||
|
||||
/**********************************************************************
|
||||
* array_value
|
||||
*
|
||||
* Return the nth element of the array. Don't do range checking.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_value(a,i) \
|
||||
((a)->base[i])
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
F u n c t i o n s
|
||||
----------------------------------------------------------------------*/
|
||||
ARRAY array_insert(ARRAY array, int index, void *value);
|
||||
|
||||
ARRAY array_new(int num);
|
||||
|
||||
ARRAY array_push(ARRAY array, void *value);
|
||||
|
||||
/*
|
||||
#if defined(__STDC__) || defined(__cplusplus)
|
||||
# define _ARGS(s) s
|
||||
#else
|
||||
# define _ARGS(s) ()
|
||||
#endif*/
|
||||
|
||||
/* array.c
|
||||
ARRAY array_insert
|
||||
_ARGS((ARRAY array,
|
||||
int index,
|
||||
char *value));
|
||||
|
||||
ARRAY array_new
|
||||
_ARGS((int num));
|
||||
|
||||
ARRAY array_push
|
||||
_ARGS((ARRAY array,
|
||||
char *value));
|
||||
|
||||
#undef _ARGS
|
||||
*/
|
||||
#endif
|
@ -7,8 +7,8 @@ AM_CPPFLAGS += -DTESS_EXPORTS \
|
||||
endif
|
||||
|
||||
noinst_HEADERS = \
|
||||
dawg.h dict.h matchdefs.h \
|
||||
permute.h states.h stopper.h trie.h
|
||||
dawg.h dawg_cache.h dict.h matchdefs.h \
|
||||
stopper.h trie.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_dict.la
|
||||
@ -25,7 +25,7 @@ endif
|
||||
|
||||
libtesseract_dict_la_SOURCES = \
|
||||
context.cpp \
|
||||
dawg.cpp dict.cpp hyphen.cpp \
|
||||
permdawg.cpp permute.cpp states.cpp stopper.cpp trie.cpp
|
||||
dawg.cpp dawg_cache.cpp dict.cpp hyphen.cpp \
|
||||
permdawg.cpp stopper.cpp trie.cpp
|
||||
|
||||
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "freelist.h"
|
||||
#include "helpers.h"
|
||||
#include "strngs.h"
|
||||
#include "tesscallback.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
@ -45,25 +46,29 @@
|
||||
----------------------------------------------------------------------*/
|
||||
namespace tesseract {
|
||||
|
||||
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
|
||||
if (word.length() == 0) return false;
|
||||
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
|
||||
bool requires_complete) const {
|
||||
if (word.length() == 0) return !requires_complete;
|
||||
NODE_REF node = 0;
|
||||
int end_index = word.length() - 1;
|
||||
for (int i = 0; i <= end_index; i++) {
|
||||
if (debug_level_ > 1) {
|
||||
tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node);
|
||||
print_node(node, MAX_NODE_EDGES_DISPLAY);
|
||||
tprintf("\n");
|
||||
for (int i = 0; i < end_index; i++) {
|
||||
EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
|
||||
if (edge == NO_EDGE) {
|
||||
return false;
|
||||
}
|
||||
EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index);
|
||||
if (edge != NO_EDGE) {
|
||||
node = next_node(edge);
|
||||
if (node == 0) node = NO_EDGE;
|
||||
} else {
|
||||
if ((node = next_node(edge)) == 0) {
|
||||
// This only happens if all words following this edge terminate --
|
||||
// there are no larger words. See Trie::add_word_to_dawg()
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
// Now check the last character.
|
||||
return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
|
||||
NO_EDGE;
|
||||
}
|
||||
|
||||
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
|
||||
return prefix_in_dawg(word, true);
|
||||
}
|
||||
|
||||
int Dawg::check_for_words(const char *filename,
|
||||
@ -99,23 +104,36 @@ int Dawg::check_for_words(const char *filename,
|
||||
}
|
||||
|
||||
void Dawg::iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const {
|
||||
WERD_CHOICE word(&unicharset);
|
||||
iterate_words_rec(word, 0, cb);
|
||||
}
|
||||
|
||||
void CallWithUTF8(TessCallback1<const char *> *cb, const WERD_CHOICE *wc) {
|
||||
STRING s;
|
||||
wc->string_and_lengths(&s, NULL);
|
||||
cb->Run(s.string());
|
||||
}
|
||||
|
||||
void Dawg::iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
TessCallback1<const WERD_CHOICE *> *shim =
|
||||
NewPermanentTessCallback(CallWithUTF8, cb);
|
||||
WERD_CHOICE word(&unicharset);
|
||||
iterate_words_rec(word, 0, shim);
|
||||
delete shim;
|
||||
}
|
||||
|
||||
void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
|
||||
NODE_REF to_explore,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const {
|
||||
NodeChildVector children;
|
||||
this->unichar_ids_of(to_explore, &children);
|
||||
this->unichar_ids_of(to_explore, &children, false);
|
||||
for (int i = 0; i < children.size(); i++) {
|
||||
WERD_CHOICE next_word(word_so_far);
|
||||
next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
|
||||
if (this->end_of_word(children[i].edge_ref)) {
|
||||
STRING s;
|
||||
next_word.string_and_lengths(&s, NULL);
|
||||
cb->Run(s.string());
|
||||
cb->Run(&next_word);
|
||||
}
|
||||
NODE_REF next = next_node(children[i].edge_ref);
|
||||
if (next != 0) {
|
||||
@ -132,7 +150,7 @@ bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
|
||||
if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
|
||||
bool any_matched = false;
|
||||
NodeChildVector vec;
|
||||
this->unichar_ids_of(node, &vec);
|
||||
this->unichar_ids_of(node, &vec, false);
|
||||
for (int i = 0; i < vec.size(); ++i) {
|
||||
word->set_unichar_id(vec[i].unichar_id, index);
|
||||
if (match_words(word, index, node, wildcard))
|
||||
|
108
dict/dawg.h
108
dict/dawg.h
@ -91,10 +91,6 @@ enum DawgType {
|
||||
#define NUM_FLAG_BITS 3
|
||||
#define REFFORMAT "%lld"
|
||||
|
||||
// Set kBeginningDawgsType[i] to true if a Dawg of
|
||||
// DawgType i can contain the beginning of a word.
|
||||
static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 };
|
||||
|
||||
static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
|
||||
{ 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION
|
||||
{ 1, 0, 0, 0 }, // for DAWG_TYPE_WORD
|
||||
@ -137,12 +133,21 @@ class Dawg {
|
||||
/// Returns true if the given word is in the Dawg.
|
||||
bool word_in_dawg(const WERD_CHOICE &word) const;
|
||||
|
||||
// Returns true if the given word prefix is not contraindicated by the dawg.
|
||||
// If requires_complete is true, then the exact complete word must be present.
|
||||
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
|
||||
|
||||
/// Checks the Dawg for the words that are listed in the requested file.
|
||||
/// Returns the number of words in the given file missing from the Dawg.
|
||||
int check_for_words(const char *filename,
|
||||
const UNICHARSET &unicharset,
|
||||
bool enable_wildcard) const;
|
||||
|
||||
// For each word in the Dawg, call the given (permanent) callback with the
|
||||
// text (UTF-8) version of the word.
|
||||
void iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const;
|
||||
|
||||
// For each word in the Dawg, call the given (permanent) callback with the
|
||||
// text (UTF-8) version of the word.
|
||||
void iterate_words(const UNICHARSET &unicharset,
|
||||
@ -156,7 +161,8 @@ class Dawg {
|
||||
|
||||
/// Fills the given NodeChildVector with all the unichar ids (and the
|
||||
/// corresponding EDGE_REFs) for which there is an edge out of this node.
|
||||
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
|
||||
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
|
||||
bool word_end) const = 0;
|
||||
|
||||
/// Returns the next node visited by following the edge
|
||||
/// indicated by the given EDGE_REF.
|
||||
@ -277,7 +283,7 @@ class Dawg {
|
||||
// Recursively iterate over all words in a dawg (see public iterate_words).
|
||||
void iterate_words_rec(const WERD_CHOICE &word_so_far,
|
||||
NODE_REF to_explore,
|
||||
TessCallback1<const char *> *cb) const;
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const;
|
||||
|
||||
// Member Variables.
|
||||
DawgType type_;
|
||||
@ -299,22 +305,71 @@ class Dawg {
|
||||
};
|
||||
|
||||
//
|
||||
/// DawgInfo struct and DawgInfoVector class are used for
|
||||
/// storing information about the current Dawg search state.
|
||||
// DawgPosition keeps track of where we are in the primary dawg we're searching
|
||||
// as well as where we may be in the "punctuation dawg" which may provide
|
||||
// surrounding context.
|
||||
//
|
||||
struct DawgInfo {
|
||||
DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
|
||||
DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
|
||||
bool operator==(const DawgInfo &other) {
|
||||
return (this->dawg_index == other.dawg_index && this->ref == other.ref);
|
||||
// Example:
|
||||
// punctuation dawg -- space is the "pattern character"
|
||||
// " " // no punctuation
|
||||
// "' '" // leading and trailing apostrophes
|
||||
// " '" // trailing apostrophe
|
||||
// word dawg:
|
||||
// "cat"
|
||||
// "cab"
|
||||
// "cat's"
|
||||
//
|
||||
// DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
|
||||
//
|
||||
// DawgPosition(-1, NO_EDGE, p, pe, false)
|
||||
// We're in the punctuation dawg, no other dawg has been started.
|
||||
// (1) If there's a pattern edge as a punc dawg child of us,
|
||||
// for each punc-following dawg starting with ch, produce:
|
||||
// Result: DawgPosition(k, w, p', false)
|
||||
// (2) If there's a valid continuation in the punc dawg, produce:
|
||||
// Result: DawgPosition(-k, NO_EDGE, p', false)
|
||||
//
|
||||
// DawgPosition(k, w, -1, NO_EDGE, false)
|
||||
// We're in dawg k. Going back to punctuation dawg is not an option.
|
||||
// Follow ch in dawg k.
|
||||
//
|
||||
// DawgPosition(k, w, p, pe, false)
|
||||
// We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
|
||||
// If ending, check that the punctuation dawg is also ok to end here.
|
||||
//
|
||||
// DawgPosition(k, w, p, pe true)
|
||||
// We're back in the punctuation dawg. Continuing there is the only option.
|
||||
struct DawgPosition {
|
||||
DawgPosition()
|
||||
: dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE),
|
||||
back_to_punc(false) {}
|
||||
DawgPosition(int dawg_idx, EDGE_REF dawgref,
|
||||
int punc_idx, EDGE_REF puncref,
|
||||
bool backtopunc)
|
||||
: dawg_index(dawg_idx), dawg_ref(dawgref),
|
||||
punc_index(punc_idx), punc_ref(puncref),
|
||||
back_to_punc(backtopunc) {
|
||||
}
|
||||
int dawg_index;
|
||||
EDGE_REF ref;
|
||||
bool operator==(const DawgPosition &other) {
|
||||
return dawg_index == other.dawg_index &&
|
||||
dawg_ref == other.dawg_ref &&
|
||||
punc_index == other.punc_index &&
|
||||
punc_ref == other.punc_ref &&
|
||||
back_to_punc == other.back_to_punc;
|
||||
}
|
||||
|
||||
inT8 dawg_index;
|
||||
EDGE_REF dawg_ref;
|
||||
inT8 punc_index;
|
||||
EDGE_REF punc_ref;
|
||||
// Have we returned to the punc dawg at the end of the word?
|
||||
bool back_to_punc;
|
||||
};
|
||||
class DawgInfoVector : public GenericVector<DawgInfo> {
|
||||
|
||||
class DawgPositionVector : public GenericVector<DawgPosition> {
|
||||
public:
|
||||
/// Overload destructor, since clear() does not delete data_[] any more.
|
||||
~DawgInfoVector() {
|
||||
~DawgPositionVector() {
|
||||
if (size_reserved_ > 0) {
|
||||
delete[] data_;
|
||||
size_used_ = 0;
|
||||
@ -327,15 +382,17 @@ class DawgInfoVector : public GenericVector<DawgInfo> {
|
||||
/// Adds an entry for the given dawg_index with the given node to the vec.
|
||||
/// Returns false if the same entry already exists in the vector,
|
||||
/// true otherwise.
|
||||
inline bool add_unique(const DawgInfo &new_info, bool debug,
|
||||
inline bool add_unique(const DawgPosition &new_pos,
|
||||
bool debug,
|
||||
const char *debug_msg) {
|
||||
for (int i = 0; i < size_used_; ++i) {
|
||||
if (data_[i] == new_info) return false;
|
||||
if (data_[i] == new_pos) return false;
|
||||
}
|
||||
push_back(new_info);
|
||||
push_back(new_pos);
|
||||
if (debug) {
|
||||
tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
|
||||
new_info.dawg_index, new_info.ref);
|
||||
tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
|
||||
debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
|
||||
new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -385,12 +442,15 @@ class SquishedDawg : public Dawg {
|
||||
|
||||
/// Fills the given NodeChildVector with all the unichar ids (and the
|
||||
/// corresponding EDGE_REFs) for which there is an edge out of this node.
|
||||
void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
|
||||
void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
|
||||
bool word_end) const {
|
||||
EDGE_REF edge = node;
|
||||
if (!edge_occupied(edge) || edge == NO_EDGE) return;
|
||||
assert(forward_edge(edge)); // we don't expect any backward edges to
|
||||
do { // be present when this funciton is called
|
||||
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
|
||||
if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
|
||||
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
|
||||
}
|
||||
} while (!last_edge(edge++));
|
||||
}
|
||||
|
||||
|
102
dict/dawg_cache.cpp
Normal file
102
dict/dawg_cache.cpp
Normal file
@ -0,0 +1,102 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dawg_cache.h
|
||||
// Description: A class that knows about loading and caching dawgs.
|
||||
// Author: David Eger
|
||||
// Created: Fri Jan 27 12:08:00 PST 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "dawg_cache.h"
|
||||
|
||||
#include "dawg.h"
|
||||
#include "object_cache.h"
|
||||
#include "strngs.h"
|
||||
#include "tessdatamanager.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct DawgLoader {
|
||||
DawgLoader(const STRING &lang,
|
||||
const char *data_file_name,
|
||||
TessdataType tessdata_dawg_type,
|
||||
int dawg_debug_level)
|
||||
: lang_(lang),
|
||||
data_file_name_(data_file_name),
|
||||
tessdata_dawg_type_(tessdata_dawg_type),
|
||||
dawg_debug_level_(dawg_debug_level) {}
|
||||
|
||||
Dawg *Load();
|
||||
|
||||
STRING lang_;
|
||||
const char *data_file_name_;
|
||||
TessdataType tessdata_dawg_type_;
|
||||
int dawg_debug_level_;
|
||||
};
|
||||
|
||||
Dawg *DawgCache::GetSquishedDawg(
|
||||
const STRING &lang,
|
||||
const char *data_file_name,
|
||||
TessdataType tessdata_dawg_type,
|
||||
int debug_level) {
|
||||
STRING data_id = data_file_name;
|
||||
data_id += kTessdataFileSuffixes[tessdata_dawg_type];
|
||||
DawgLoader loader(lang, data_file_name, tessdata_dawg_type, debug_level);
|
||||
return dawgs_.Get(data_id, NewTessCallback(&loader, &DawgLoader::Load));
|
||||
}
|
||||
|
||||
Dawg *DawgLoader::Load() {
|
||||
TessdataManager data_loader;
|
||||
if (!data_loader.Init(data_file_name_, dawg_debug_level_)) {
|
||||
return NULL;
|
||||
}
|
||||
if (!data_loader.SeekToStart(tessdata_dawg_type_)) return NULL;
|
||||
FILE *fp = data_loader.GetDataFilePtr();
|
||||
DawgType dawg_type;
|
||||
PermuterType perm_type;
|
||||
switch (tessdata_dawg_type_) {
|
||||
case TESSDATA_PUNC_DAWG:
|
||||
dawg_type = DAWG_TYPE_PUNCTUATION;
|
||||
perm_type = PUNC_PERM;
|
||||
break;
|
||||
case TESSDATA_SYSTEM_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD;
|
||||
perm_type = SYSTEM_DAWG_PERM;
|
||||
break;
|
||||
case TESSDATA_NUMBER_DAWG:
|
||||
dawg_type = DAWG_TYPE_NUMBER;
|
||||
perm_type = NUMBER_PERM;
|
||||
break;
|
||||
case TESSDATA_BIGRAM_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD; // doesn't actually matter
|
||||
perm_type = COMPOUND_PERM; // doesn't actually matter
|
||||
break;
|
||||
case TESSDATA_UNAMBIG_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD;
|
||||
perm_type = SYSTEM_DAWG_PERM;
|
||||
break;
|
||||
case TESSDATA_FREQ_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD;
|
||||
perm_type = FREQ_DAWG_PERM;
|
||||
break;
|
||||
default:
|
||||
data_loader.End();
|
||||
return NULL;
|
||||
}
|
||||
SquishedDawg *retval =
|
||||
new SquishedDawg(fp, dawg_type, lang_, perm_type, dawg_debug_level_);
|
||||
data_loader.End();
|
||||
return retval;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
56
dict/dawg_cache.h
Normal file
56
dict/dawg_cache.h
Normal file
@ -0,0 +1,56 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dawg_cache.h
|
||||
// Description: A class that knows about loading and caching dawgs.
|
||||
// Author: David Eger
|
||||
// Created: Fri Jan 27 12:08:00 PST 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_DICT_DAWG_CACHE_H_
|
||||
#define TESSERACT_DICT_DAWG_CACHE_H_
|
||||
|
||||
#include "dawg.h"
|
||||
#include "object_cache.h"
|
||||
#include "strngs.h"
|
||||
#include "tessdatamanager.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class DawgCache {
|
||||
public:
|
||||
Dawg *GetSquishedDawg(
|
||||
const STRING &lang,
|
||||
const char *data_file_name,
|
||||
TessdataType tessdata_dawg_type,
|
||||
int debug_level);
|
||||
|
||||
// If we manage the given dawg, decrement its count,
|
||||
// and possibly delete it if the count reaches zero.
|
||||
// If dawg is unknown to us, return false.
|
||||
bool FreeDawg(Dawg *dawg) {
|
||||
return dawgs_.Free(dawg);
|
||||
}
|
||||
|
||||
// Free up any currently unused dawgs.
|
||||
void DeleteUnusedDawgs() {
|
||||
dawgs_.DeleteUnusedObjects();
|
||||
}
|
||||
|
||||
private:
|
||||
ObjectCache<Dawg> dawgs_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_DICT_DAWG_CACHE_H_
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user