Major refactor of control.cpp to enable line recognition

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1147 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 09:52:40 +08:00 · 2014-08-11 23:23:06 +00:00 · 2014-08-11 23:23:06 +00:00 · dbf6197471
commit dbf6197471
parent e249d7bcb2
34 changed files with 931 additions and 560 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@@ -790,6 +790,10 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
 * Runs page layout analysis in the mode set by SetPageSegMode.
 * May optionally be called prior to Recognize to get access to just
 * the page layout results. Returns an iterator to the results.
+ * If merge_similar_words is true, words are combined where suitable for use
+ * with a line recognizer. Use if you want to use AnalyseLayout to find the
+ * textlines, and then want to process textline fragments with an external
+ * line recognizer.
 * Returns NULL on error or an empty page.
 * The returned iterator must be deleted after use.
 * WARNING! This class points to data held within the TessBaseAPI class, and
@@ -797,11 +801,11 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
 * DetectOS, or anything else that changes the internal PAGE_RES.
 */
-PageIterator* TessBaseAPI::AnalyseLayout() {
+PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
  if (FindLines() == 0) {
    if (block_list_->empty())
      return NULL;  // The page was empty.
-    page_res_ = new PAGE_RES(block_list_, NULL);
+    page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
    DetectParagraphs(false);
    return new PageIterator(
        page_res_, tesseract_, thresholder_->GetScaleFactor(),
@@ -823,18 +827,22 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
  if (page_res_ != NULL)
    delete page_res_;
  if (block_list_->empty()) {
-    page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
+    page_res_ = new PAGE_RES(false, block_list_,
+                             &tesseract_->prev_word_best_choice_);
    return 0; // Empty page.
  }

  tesseract_->SetBlackAndWhitelist();
  recognition_done_ = true;
-  if (tesseract_->tessedit_resegment_from_line_boxes)
+  if (tesseract_->tessedit_resegment_from_line_boxes) {
    page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
-  else if (tesseract_->tessedit_resegment_from_boxes)
+  } else if (tesseract_->tessedit_resegment_from_boxes) {
    page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
-  else
-    page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
+  } else {
+    // TODO(rays) LSTM here.
+    page_res_ = new PAGE_RES(false,
+                             block_list_, &tesseract_->prev_word_best_choice_);
+  }
  if (tesseract_->tessedit_make_boxes_from_boxes) {
    tesseract_->CorrectClassifyWords(page_res_);
    return 0;
@@ -900,7 +908,8 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {

  recognition_done_ = true;

-  page_res_ = new PAGE_RES(block_list_, &(tesseract_->prev_word_best_choice_));
+  page_res_ = new PAGE_RES(false, block_list_,
+                           &(tesseract_->prev_word_best_choice_));

  PAGE_RES_IT page_res_it(page_res_);

@@ -1977,7 +1986,10 @@ void TessBaseAPI::Threshold(Pix** pix) {
    // than over-estimate resolution.
    thresholder_->SetSourceYResolution(kMinCredibleResolution);
  }
-  thresholder_->ThresholdToPix(pix);
+  PageSegMode pageseg_mode =
+      static_cast<PageSegMode>(
+          static_cast<int>(tesseract_->tessedit_pageseg_mode));
+  thresholder_->ThresholdToPix(pageseg_mode, pix);
  thresholder_->GetImageSizes(&rect_left_, &rect_top_,
                              &rect_width_, &rect_height_,
                              &image_width_, &image_height_);
@@ -2332,7 +2344,7 @@ void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,


 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
-  PAGE_RES *page_res = new PAGE_RES(block_list,
+  PAGE_RES *page_res = new PAGE_RES(false, block_list,
                                    &(tesseract_->prev_word_best_choice_));
  tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
  return page_res;
@@ -2341,7 +2353,7 @@ PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
                                        PAGE_RES* pass1_result) {
  if (!pass1_result)
-    pass1_result = new PAGE_RES(block_list,
+    pass1_result = new PAGE_RES(false, block_list,
                                &(tesseract_->prev_word_best_choice_));
  tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
  return pass1_result;
--- a/api/baseapi.h
+++ b/api/baseapi.h
@@ -484,14 +484,21 @@ class TESS_API TessBaseAPI {
   * Runs page layout analysis in the mode set by SetPageSegMode.
   * May optionally be called prior to Recognize to get access to just
   * the page layout results. Returns an iterator to the results.
-   * Returns NULL on error.
+   * If merge_similar_words is true, words are combined where suitable for use
+   * with a line recognizer. Use if you want to use AnalyseLayout to find the
+   * textlines, and then want to process textline fragments with an external
+   * line recognizer.
+   * Returns NULL on error or an empty page.
   * The returned iterator must be deleted after use.
   * WARNING! This class points to data held within the TessBaseAPI class, and
   * therefore can only be used while the TessBaseAPI class still exists and
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
   * DetectOS, or anything else that changes the internal PAGE_RES.
   */
-  PageIterator* AnalyseLayout();
+  PageIterator* AnalyseLayout() {
+    return AnalyseLayout(false);
+  }
+  PageIterator* AnalyseLayout(bool merge_similar_words);

  /**
   * Recognize the image from SetAndThresholdImage, generating Tesseract
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@@ -110,30 +110,20 @@ static void clear_any_old_text(BLOCK_LIST *block_list) {
 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
                                bool find_segmentation,
                                BLOCK_LIST *block_list) {
-  int box_count = 0;
-  int box_failures = 0;
-
-  FILE* box_file = OpenBoxFile(fname);
-  TBOX box;
  GenericVector<TBOX> boxes;
  GenericVector<STRING> texts, full_texts;
-
-  bool found_box = true;
-  while (found_box) {
-    int line_number = 0;           // Line number of the box file.
-    STRING text, full_text;
-    found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
-    if (found_box) {
-      ++box_count;
-      MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
-    } else {
-      full_text = "";
-    }
-    boxes.push_back(box);
-    texts.push_back(text);
-    full_texts.push_back(full_text);
+  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
+                    NULL)) {
+    return NULL;  // Can't do it.
  }

+  int box_count = boxes.size();
+  int box_failures = 0;
+  // Add an empty everything to the end.
+  boxes.push_back(TBOX());
+  texts.push_back(STRING());
+  full_texts.push_back(STRING());
+
  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES* page_res = find_segmentation ?
@@ -239,7 +229,7 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
      }
    }
  }
-  PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
+  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  while ((word_res = pr_it.word()) != NULL) {
--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@@ -69,16 +69,11 @@ const double kMinRefitXHeightFraction = 0.5;
 namespace tesseract {
 void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
                                  TBOX &selection_box) {
-  WERD *word;
-  ROW *pseudo_row;               // row of word
-  BLOCK *pseudo_block;           // block of word
-
-  word = make_pseudo_word(page_res, selection_box,
-                          pseudo_block, pseudo_row);
-  if (word != NULL) {
-    WERD_RES word_res(word);
-    recog_interactive(pseudo_block, pseudo_row, &word_res);
-    delete word;
+  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
+  if (it != NULL) {
+    recog_interactive(it);
+    it->DeleteCurrentWord();
+    delete it;
  }
 }

@@ -92,19 +87,22 @@ void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
 * @param row row of word
 * @param word_res word to recognise
 */
-BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) {
+BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
  inT16 char_qual;
  inT16 good_char_qual;

-  WordData word_data(block, row, word_res);
+  WordData word_data(*pr_it);
  SetupWordPassN(2, &word_data);
-  classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
+  classify_word_and_language(&Tesseract::classify_word_pass2, pr_it,
+                             &word_data);
  if (tessedit_debug_quality_metrics) {
-    word_char_quality(word_res, row, &char_qual, &good_char_qual);
-    tprintf
-      ("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
-      word_res->reject_map.length(), word_blob_quality(word_res, row),
-      word_outline_errs(word_res), char_qual, good_char_qual);
+    WERD_RES* word_res = pr_it->word();
+    word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
+    tprintf("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
+            "char_quality: %d; good_char_quality: %d\n",
+            word_res->reject_map.length(),
+            word_blob_quality(word_res, pr_it->row()->row),
+            word_outline_errs(word_res), char_qual, good_char_qual);
  }
  return TRUE;
 }
@@ -163,8 +161,6 @@ void Tesseract::SetupAllWordsPassN(int pass_n,
  PAGE_RES_IT page_res_it(page_res);
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
-    if (pass_n == 1)
-      page_res_it.word()->SetupFake(unicharset);
    if (target_word_box == NULL ||
        ProcessTargetWord(page_res_it.word()->word->bounding_box(),
                          *target_word_box, word_config, 1)) {
@@ -180,33 +176,29 @@ void Tesseract::SetupAllWordsPassN(int pass_n,

 // Sets up the single word ready for whichever engine is to be run.
 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
-  if (pass_n == 1 || !word->word->done || tessedit_training_tess) {
-    if (pass_n == 2) {
+  if (pass_n == 1 || !word->word->done) {
+    if (pass_n == 1) {
+      word->word->SetupForRecognition(unicharset, this, BestPix(),
+                                      tessedit_ocr_engine_mode, NULL,
+                                      classify_bln_numeric_mode,
+                                      textord_use_cjk_fp_model,
+                                      poly_allow_detailed_fx,
+                                      word->row, word->block);
+    } else if (pass_n == 2) {
      // TODO(rays) Should we do this on pass1 too?
      word->word->caps_height = 0.0;
      if (word->word->x_height == 0.0f)
        word->word->x_height = word->row->x_height();
    }
-    // Cube doesn't get setup for pass2.
-    if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
-      word->word->SetupForRecognition(
-            unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
-            classify_bln_numeric_mode, textord_use_cjk_fp_model,
-            poly_allow_detailed_fx, word->row, word->block);
-    }
-  }
-  if (!sub_langs_.empty()) {
-    if (word->lang_words.size() != sub_langs_.size()) {
-      // Setup the words for all the sub-languages now.
-      WERD_RES empty;
-      word->lang_words.init_to_size(sub_langs_.size(), empty);
-    }
-    for (int s = 0; s < sub_langs_.size(); ++s) {
-      Tesseract* lang_t = sub_langs_[s];
-      if (pass_n == 1 || (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY &&
-          (!word->lang_words[s].done || lang_t->tessedit_training_tess))) {
-        word->lang_words[s].InitForRetryRecognition(*word->word);
-        word->lang_words[s].SetupForRecognition(
+    for (int s = 0; s <= sub_langs_.size(); ++s) {
+      // The sub_langs_.size() entry is for the master language.
+      Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
+      WERD_RES* word_res = new WERD_RES;
+      word_res->InitForRetryRecognition(*word->word);
+      word->lang_words.push_back(word_res);
+      // Cube doesn't get setup for pass2.
+      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
+        word_res->SetupForRecognition(
              lang_t->unicharset, lang_t, BestPix(),
              lang_t->tessedit_ocr_engine_mode, NULL,
              lang_t->classify_bln_numeric_mode,
@@ -217,17 +209,19 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
  }
 }

-
 // Runs word recognition on all the words.
 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
+                                   PAGE_RES_IT* pr_it,
                                   GenericVector<WordData>* words) {
  // TODO(rays) Before this loop can be parallelized (it would yield a massive
  // speed-up) all remaining member globals need to be converted to local/heap
  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
  // added. The results will be significantly different with adaption on, and
  // deterioration will need investigation.
+  pr_it->restart_page();
  for (int w = 0; w < words->size(); ++w) {
    WordData* word = &(*words)[w];
+    if (w > 0) word->prev_word = &(*words)[w - 1];
    if (monitor != NULL) {
      monitor->ocr_alive = TRUE;
      if (pass_n == 1)
@@ -244,16 +238,26 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
        return false;
      }
    }
-    if (word->word->tess_failed) continue;
+    if (word->word->tess_failed) {
+      int s;
+      for (s = 0; s < word->lang_words.size() &&
+           word->lang_words[s]->tess_failed; ++s) {}
+      // If all are failed, skip it. Image words are skipped by this test.
+      if (s > word->lang_words.size()) continue;
+    }
+    // Sync pr_it with the wth WordData.
+    while (pr_it->word() != NULL && pr_it->word() != word->word)
+      pr_it->forward();
+    ASSERT_HOST(pr_it->word() != NULL);
    WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
                                            : &Tesseract::classify_word_pass2;
-    classify_word_and_language(recognizer, word);
+    classify_word_and_language(recognizer, pr_it, word);
    if (tessedit_dump_choices) {
-      word_dumper(NULL, word->row, word->word);
      tprintf("Pass%d: %s [%s]\n", pass_n,
              word->word->best_choice->unichar_string().string(),
              word->word->best_choice->debug_string().string());
    }
+    pr_it->forward();
  }
  return true;
 }
@@ -326,12 +330,12 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,

    most_recently_used_ = this;
    // Run pass 1 word recognition.
-    if (!RecogAllWordsPassN(1, monitor, &words)) return false;
+    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
    // Pass 1 post-processing.
-    while (page_res_it.word() != NULL) {
+    for (page_res_it.restart_page(); page_res_it.word() != NULL;
+         page_res_it.forward()) {
      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
        fix_rep_char(&page_res_it);
-        page_res_it.forward();
        continue;
      }

@@ -346,15 +350,14 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
        page_res->misadaption_log.push_back(
            page_res_it.word()->blamer_bundle->misadaption_debug());
      }
-
-      page_res_it.forward();
    }
  }

  if (dopasses == 1) return true;

  // ****************** Pass 2 *******************
-  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) {
+  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
+      tessedit_ocr_engine_mode != OEM_CUBE_ONLY ) {
    page_res_it.restart_page();
    GenericVector<WordData> words;
    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
@@ -363,17 +366,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
    }
    most_recently_used_ = this;
    // Run pass 2 word recognition.
-    if (!RecogAllWordsPassN(2, monitor, &words)) return false;
-    // Pass 2 post-processing.
-    while (page_res_it.word() != NULL) {
-      WERD_RES* word = page_res_it.word();
-       if (word->word->flag(W_REP_CHAR) && !word->done) {
-        fix_rep_char(&page_res_it);
-        page_res_it.forward();
-        continue;
-      }
-      page_res_it.forward();
-    }
+    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
  }

  // The next passes can only be run if tesseract has been used, as cube
@@ -407,8 +400,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
    // ****************** Pass 9 *******************
    // Check the correctness of the final results.
    blamer_pass(page_res);
+    script_pos_pass(page_res);
  }
-  script_pos_pass(page_res);

  // Write results pass.
  set_global_loc_code(LOC_WRITE_RESULTS);
@@ -745,166 +738,232 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
  }
 }

-// Helper returns true if the new_word is better than the word, using a
-// simple test of better certainty AND rating (to reduce false positives
-// from cube) or a dictionary vs non-dictionary word.
-static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word,
-                          double rating_ratio,
-                          double certainty_margin) {
-  if (new_word.best_choice == NULL) {
-    return false;  // New one no good.
+// Factored helper considers the indexed word and updates all the pointed
+// values.
+static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
+                         float* rating, float* certainty, bool* bad,
+                         bool* valid_permuter, int* right, int* next_left) {
+  *right = -MAX_INT32;
+  *next_left = MAX_INT32;
+  if (index < words.size()) {
+    WERD_CHOICE* choice = words[index]->best_choice;
+    if (choice == NULL) {
+      *bad = true;
+    } else {
+      *rating += choice->rating();
+      *certainty = MIN(*certainty, choice->certainty());
+      if (!Dict::valid_word_permuter(choice->permuter(), false))
+        *valid_permuter = false;
+    }
+    *right = words[index]->word->bounding_box().right();
+    if (index + 1 < words.size())
+      *next_left = words[index + 1]->word->bounding_box().left();
+  } else {
+    *valid_permuter = false;
+    *bad = true;
  }
-  if (word.best_choice == NULL) {
-    return true;  // Old one no good.
+}
+
+// Helper chooses the best combination of words, transferring good ones from
+// new_words to best_words. To win, a new word must have (better rating and
+// certainty) or (better permuter status and rating within rating ratio and
+// certainty within certainty margin) than current best.
+// All the new_words are consumed (moved to best_words or deleted.)
+// The return value is the number of new_words used minus the number of
+// best_words that remain in the output.
+static int SelectBestWords(double rating_ratio,
+                           double certainty_margin,
+                           bool debug,
+                           PointerVector<WERD_RES>* new_words,
+                           PointerVector<WERD_RES>* best_words) {
+  // Process the smallest groups of words that have an overlapping word
+  // boundary at the end.
+  GenericVector<WERD_RES*> out_words;
+  // Index into each word vector (best, new).
+  int b = 0, n = 0;
+  int num_best = 0, num_new = 0;
+  while (b < best_words->size() || n < new_words->size()) {
+    // Start of the current run in each.
+    int start_b = b, start_n = n;
+    // Rating of the current run in each.
+    float b_rating = 0.0f, n_rating = 0.0f;
+    // Certainty of the current run in each.
+    float b_certainty = 0.0f, n_certainty = 0.0f;
+    // True if any word is missing its best choice.
+    bool b_bad = false, n_bad = false;
+    // True if all words have a valid permuter.
+    bool b_valid_permuter = true, n_valid_permuter = true;
+
+    while (b < best_words->size() || n < new_words->size()) {
+      int b_right = -MAX_INT32;
+      int next_b_left = MAX_INT32;
+      EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
+                   &b_valid_permuter, &b_right, &next_b_left);
+      int n_right = -MAX_INT32;
+      int next_n_left = MAX_INT32;
+      EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
+                   &n_valid_permuter, &n_right, &next_n_left);
+      if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
+        // The word breaks overlap. [start_b,b] and [start_n, n] match.
+        break;
+      }
+      // Keep searching for the matching word break.
+      if ((b_right < n_right && b < best_words->size()) ||
+          n == new_words->size())
+        ++b;
+      else
+        ++n;
+    }
+    bool new_better = false;
+    if (!n_bad && (b_bad || (n_certainty > b_certainty &&
+                             n_rating < b_rating) ||
+                            (!b_valid_permuter && n_valid_permuter &&
+                             n_rating < b_rating * rating_ratio &&
+                             n_certainty > b_certainty - certainty_margin))) {
+      // New is better.
+      for (int i = start_n; i <= n; ++i) {
+        out_words.push_back((*new_words)[i]);
+        (*new_words)[i] = NULL;
+        ++num_new;
+      }
+      new_better = true;
+    } else if (!b_bad) {
+      // Current best is better.
+      for (int i = start_b; i <= b; ++i) {
+        out_words.push_back((*best_words)[i]);
+        (*best_words)[i] = NULL;
+        ++num_best;
+      }
+    }
+    int end_b = b < best_words->size() ? b + 1 : b;
+    int end_n = n < new_words->size() ? n + 1 : n;
+    if (debug) {
+      tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
+              " valid dict: %d v %d\n",
+              end_n - start_n, new_better ? "better" : "worse",
+              end_b - start_b, n_rating, b_rating,
+              n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
+    }
+    // Move on to the next group.
+    b = end_b;
+    n = end_n;
  }
-  if (new_word.best_choice->certainty() > word.best_choice->certainty() &&
-      new_word.best_choice->rating() < word.best_choice->rating()) {
-    return true;  // New word has better confidence.
-  }
-  if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
-      Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
-      new_word.best_choice->rating() <
-          word.best_choice->rating() * rating_ratio &&
-      new_word.best_choice->certainty() >
-          word.best_choice->certainty() - certainty_margin) {
-    return true;  // New word is from a dictionary.
-  }
-  return false;  // New word is no better.
+  // Transfer from out_words to best_words.
+  best_words->clear();
+  for (int i = 0; i < out_words.size(); ++i)
+    best_words->push_back(out_words[i]);
+  return num_new - num_best;
 }

 // Helper to recognize the word using the given (language-specific) tesseract.
-// Returns true if the result was better than previously.
-bool Tesseract::RetryWithLanguage(const WERD_RES& best_word,
-                                  WordData* word_data, WERD_RES* word,
-                                  WordRecognizer recognizer) {
-  if (classify_debug_level || cube_debug_level) {
-    tprintf("Retrying word using lang %s, oem %d\n",
+// Returns positive if this recognizer found more new best words than the
+// number kept from best_words.
+int Tesseract::RetryWithLanguage(const WordData& word_data,
+                                 WordRecognizer recognizer,
+                                 WERD_RES** in_word,
+                                 PointerVector<WERD_RES>* best_words) {
+  bool debug = classify_debug_level || cube_debug_level;
+  if (debug) {
+    tprintf("Trying word using lang %s, oem %d\n",
            lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
  }
  // Run the recognizer on the word.
+  PointerVector<WERD_RES> new_words;
+  (this->*recognizer)(word_data, in_word, &new_words);
+  if (new_words.empty()) {
+    // Transfer input word to new_words, as the classifier must have put
+    // the result back in the input.
+    new_words.push_back(*in_word);
+    *in_word = NULL;
+  }
+  if (debug) {
+    for (int i = 0; i < new_words.size(); ++i)
+      new_words[i]->DebugTopChoice("Lang result");
+  }
  // Initial version is a bit of a hack based on better certainty and rating
  // (to reduce false positives from cube) or a dictionary vs non-dictionary
  // word.
-  (this->*recognizer)(word_data, word);
-  bool new_is_better = NewWordBetter(best_word, *word,
-                                     classify_max_rating_ratio,
-                                     classify_max_certainty_margin);
-  if (classify_debug_level || cube_debug_level) {
-    if (word->best_choice == NULL) {
-      tprintf("NULL result %s better!\n",
-              new_is_better ? "IS" : "NOT");
-    } else {
-      tprintf("New result %s better:%s, r=%g, c=%g\n",
-              new_is_better ? "IS" : "NOT",
-              word->best_choice->unichar_string().string(),
-              word->best_choice->rating(),
-              word->best_choice->certainty());
-    }
+  return SelectBestWords(classify_max_rating_ratio,
+                         classify_max_certainty_margin,
+                         debug, &new_words, best_words);
+}
+
+// Helper returns true if all the words are acceptable.
+static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
+  for (int w = 0; w < words.size(); ++w) {
+    if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
  }
-  return new_is_better;
+  return true;
 }

 // Generic function for classifying a word. Can be used either for pass1 or
 // pass2 according to the function passed to recognizer.
-// word block and row are the current location in the document's PAGE_RES.
+// word_data holds the word to be recognized, and its block and row, and
+// pr_it points to the word as well, in case we are running LSTM and it wants
+// to output multiple words.
 // Recognizes in the current language, and if successful that is all.
 // If recognition was not successful, tries all available languages until
 // it gets a successful result or runs out of languages. Keeps the best result.
 void Tesseract::classify_word_and_language(WordRecognizer recognizer,
+                                           PAGE_RES_IT* pr_it,
                                           WordData* word_data) {
+  // Best result so far.
+  PointerVector<WERD_RES> best_words;
  // Points to the best result. May be word or in lang_words.
  WERD_RES* word = word_data->word;
  clock_t start_t = clock();
  if (classify_debug_level || cube_debug_level) {
-    tprintf("Processing word with lang %s at:",
+    tprintf("%s word with lang %s at:",
+            word->done ? "Already done" : "Processing",
            most_recently_used_->lang.string());
    word->word->bounding_box().print();
  }
-  const char* result_type = "Initial";
-  bool initially_done = !word->tess_failed && word->done;
-  if (initially_done) {
+  if (word->done) {
    // If done on pass1, leave it as-is.
-    most_recently_used_ = word->tesseract;
-    result_type = "Already done";
-  } else {
-    if (most_recently_used_ != this) {
-      // Point to the word for most_recently_used_.
-      for (int s = 0; s < sub_langs_.size(); ++s) {
-        if (most_recently_used_ == sub_langs_[s]) {
-          word = &word_data->lang_words[s];
-          break;
-        }
-      }
-    }
-    (most_recently_used_->*recognizer)(word_data, word);
-    if (!word->tess_failed && word->tess_accepted)
-      result_type = "Accepted";
+    if (!word->tess_failed)
+      most_recently_used_ = word->tesseract;
+    return;
  }
-  if (classify_debug_level || cube_debug_level) {
-    tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
-            " xht=[%g,%g]\n",
-            result_type,
-            word->best_choice->unichar_string().string(),
-            word->best_choice->rating(),
-            word->best_choice->certainty(),
-            word->tess_accepted, word->tess_would_adapt,
-            word->best_choice->min_x_height(),
-            word->best_choice->max_x_height());
+  int sub = sub_langs_.size();
+  if (most_recently_used_ != this) {
+    // Get the index of the most_recently_used_.
+    for (sub = 0; sub < sub_langs_.size() &&
+         most_recently_used_ != sub_langs_[sub]; ++sub) {}
  }
-  if (word->tess_failed || !word->tess_accepted) {
+  most_recently_used_->RetryWithLanguage(
+      *word_data, recognizer, &word_data->lang_words[sub], &best_words);
+  Tesseract* best_lang_tess = most_recently_used_;
+  if (!WordsAcceptable(best_words)) {
    // Try all the other languages to see if they are any better.
-    Tesseract* previous_used = most_recently_used_;
-    if (most_recently_used_ != this) {
-      if (classify_debug_level) {
-        tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
-      }
-      if (word_data->word->tesseract == this) {
-        // This is pass1, and we are trying the main language.
-        if (RetryWithLanguage(*word, word_data, word_data->word, recognizer)) {
-          most_recently_used_ = this;
-          word = word_data->word;
-        }
-      } else {
-        // This is pass2, and we are trying the main language again, but it
-        // has no word allocated to it, so we must re-initialize it.
-        WERD_RES main_word(*word_data->word);
-        main_word.InitForRetryRecognition(*word_data->word);
-        main_word.SetupForRecognition(unicharset, this, BestPix(),
-                                      tessedit_ocr_engine_mode, NULL,
-                                      classify_bln_numeric_mode,
-                                      textord_use_cjk_fp_model,
-                                      poly_allow_detailed_fx,
-                                      word_data->row, word_data->block);
-        if (RetryWithLanguage(*word, word_data, &main_word, recognizer)) {
-          most_recently_used_ = this;
-          word_data->word->ConsumeWordResults(&main_word);
-          word = word_data->word;
-        }
-      }
-      if (!word->tess_failed && word->tess_accepted)
-        return;  // No need to look at the others.
+    if (most_recently_used_ != this &&
+        this->RetryWithLanguage(*word_data, recognizer,
+                                &word_data->lang_words[sub_langs_.size()],
+                                &best_words) > 0) {
+      best_lang_tess = this;
    }
-
-    for (int i = 0; i < sub_langs_.size(); ++i) {
-      if (sub_langs_[i] != previous_used) {
-        if (classify_debug_level) {
-          tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
-                  i, sub_langs_[i]->lang.string());
-        }
-        if (sub_langs_[i]->RetryWithLanguage(*word, word_data,
-                                             &word_data->lang_words[i],
-                                             recognizer)) {
-          most_recently_used_ = sub_langs_[i];
-          word = &word_data->lang_words[i];
-          if (!word->tess_failed && word->tess_accepted)
-            break;  // No need to look at the others.
-        }
+    for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
+         ++i) {
+      if (most_recently_used_ != sub_langs_[i] &&
+          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
+                                           &word_data->lang_words[i],
+                                           &best_words) > 0) {
+        best_lang_tess = sub_langs_[i];
      }
    }
  }
-  if (word != word_data->word) {
-    // Move the result for the best language to the main word.
-    word_data->word->ConsumeWordResults(word);
+  most_recently_used_ = best_lang_tess;
+  if (!best_words.empty()) {
+    if (best_words.size() == 1 && !best_words[0]->combination) {
+      // Move the best single result to the main word.
+      word_data->word->ConsumeWordResults(best_words[0]);
+    } else {
+      // Words came from LSTM, and must be moved to the PAGE_RES properly.
+      word_data->word = best_words.back();
+      pr_it->ReplaceCurrentWord(&best_words);
+    }
+    ASSERT_HOST(word_data->word->box_word != NULL);
+  } else {
+    tprintf("no best words!!\n");
  }
  clock_t ocr_t = clock();
  if (tessedit_timing_debug) {
@ -920,16 +979,19 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
 * Baseline normalize the word and pass it to Tess.
 */

-void Tesseract::classify_word_pass1(WordData* word_data, WERD_RES* word) {
-  ROW* row = word_data->row;
-  BLOCK* block = word_data->block;
-  prev_word_best_choice_ = word_data->prev_word != NULL
-      ? word_data->prev_word->word->best_choice : NULL;
+void Tesseract::classify_word_pass1(const WordData& word_data,
+                                    WERD_RES** in_word,
+                                    PointerVector<WERD_RES>* out_words) {
+  ROW* row = word_data.row;
+  BLOCK* block = word_data.block;
+  prev_word_best_choice_ = word_data.prev_word != NULL
+      ? word_data.prev_word->word->best_choice : NULL;
  // If we only intend to run cube - run it and return.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
-    cube_word_pass1(block, row, word);
+    cube_word_pass1(block, row, *in_word);
    return;
  }
+  WERD_RES* word = *in_word;
  match_word_pass_n(1, word, row, block);
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    word->tess_would_adapt = AdaptableWord(word);
@ -1027,19 +1089,23 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
 * Control what to do with the word in pass 2
 */

-void Tesseract::classify_word_pass2(WordData* word_data, WERD_RES* word) {
+void Tesseract::classify_word_pass2(const WordData& word_data,
+                                    WERD_RES** in_word,
+                                    PointerVector<WERD_RES>* out_words) {
  // Return if we do not want to run Tesseract.
  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
-      tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
+      tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED &&
+      word_data.word->best_choice != NULL)
    return;
-  ROW* row = word_data->row;
-  BLOCK* block = word_data->block;
-  prev_word_best_choice_ = word_data->prev_word != NULL
-      ? word_data->prev_word->word->best_choice : NULL;
+  ROW* row = word_data.row;
+  BLOCK* block = word_data.block;
+  WERD_RES* word = *in_word;
+  prev_word_best_choice_ = word_data.prev_word != NULL
+      ? word_data.prev_word->word->best_choice : NULL;

  set_global_subloc_code(SUBLOC_NORM);
  check_debug_pt(word, 30);
-  if (!word->done || tessedit_training_tess) {
+  if (!word->done) {
    word->caps_height = 0.0;
    if (word->x_height == 0.0f)
      word->x_height = row->x_height();
@ -1161,11 +1227,9 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
  const WERD_CHOICE &word = *(word_res->best_choice);

  // Find the frequency of each unique character in the word.
-  UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
  SortHelper<UNICHAR_ID> rep_ch(word.length());
  for (int i = 0; i < word.length(); ++i) {
-    if (word.unichar_id(i) != space)
-      rep_ch.Add(word.unichar_id(i), 1);
+    rep_ch.Add(word.unichar_id(i), 1);
  }

  // Find the most frequent result.
@ -1194,51 +1258,9 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
    ++gap_count;
    prev_blob = blob;
  }
-  if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
-    // Needs spaces between.
-    ExplodeRepeatedWord(best_choice, page_res_it);
-  } else {
-    // Just correct existing classification.
-    CorrectRepcharChoices(best_choice, word_res);
-    word_res->reject_map.initialise(word.length());
-  }
-}
-
-// Explode the word at the given iterator location into individual words
-// of a single given unichar_id defined by best_choice.
-// The original word is deleted, and the replacements copy most of their
-// fields from the original.
-void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
-                                    PAGE_RES_IT* page_res_it) {
-  WERD_RES *word_res = page_res_it->word();
-  ASSERT_HOST(best_choice != NULL);
-
-  // Make a new word for each blob in the original.
-  WERD* werd = word_res->word;
-  C_BLOB_IT blob_it(werd->cblob_list());
-  for (; !blob_it.empty(); blob_it.forward()) {
-    bool first_blob = blob_it.at_first();
-    bool last_blob = blob_it.at_last();
-    WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
-                                                    blob_it.extract());
-    // Note that blamer_bundle (truth information) is not copied, which is
-    // desirable, since the newly inserted words would not have the original
-    // bounding box corresponding to the one recorded in truth fields.
-    WERD_RES* rep_word =
-        page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
-    // Setup the single char WERD_RES
-    if (rep_word->SetupForRecognition(*word_res->uch_set, this, BestPix(),
-                                      tessedit_ocr_engine_mode, NULL, false,
-                                      textord_use_cjk_fp_model,
-                                      poly_allow_detailed_fx,
-                                      page_res_it->row()->row,
-                                      page_res_it->block()->block)) {
-      rep_word->CloneChoppedToRebuild();
-      BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
-      rep_word->FakeClassifyWord(1, &blob_choice);
-    }
-  }
-  page_res_it->DeleteCurrentWord();
+  // Just correct existing classification.
+  CorrectRepcharChoices(best_choice, word_res);
+  word_res->reject_map.initialise(word.length());
 }

 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
@ -1405,16 +1427,19 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
        show_map_detail = TRUE;
        break;
    }
-    tprintf(" \"%s\" ",
-            word->best_choice->unichar_string().string());
-    word->reject_map.print (debug_fp);
-    tprintf ("\n");
-    if (show_map_detail) {
-      tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
-      for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
-        tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
-        word->reject_map[i].full_print(debug_fp);
+    if (word->best_choice != NULL) {
+      tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
+      word->reject_map.print(debug_fp);
+      tprintf("\n");
+      if (show_map_detail) {
+        tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
+        for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
+          tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
+          word->reject_map[i].full_print(debug_fp);
+        }
      }
+    } else {
+      tprintf("null best choice\n");
    }
    tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
    tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
--- a/ccmain/fixspace.cpp
+++ b/ccmain/fixspace.cpp
@ -205,7 +205,8 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
    if ((!word->part_of_combo) && (word->box_word == NULL)) {
      WordData word_data(block, row, word);
      SetupWordPassN(2, &word_data);
-      classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
+      classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
+                                 &word_data);
    }
    prev_word_best_choice_ = word->best_choice;
  }
--- a/ccmain/pagewalk.cpp
+++ b/ccmain/pagewalk.cpp
@ -30,15 +30,12 @@ namespace tesseract {
 void Tesseract::process_selected_words(
    PAGE_RES* page_res, // blocks to check
    TBOX & selection_box,
-    BOOL8(tesseract::Tesseract::*word_processor)(  // function to call
-        BLOCK* block, ROW* row, WERD_RES* word_res)) {
+    BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) {
  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
       page_res_it.forward()) {
    WERD* word = page_res_it.word()->word;
    if (word->bounding_box().overlap(selection_box)) {
-      if (!((this->*word_processor)(page_res_it.block()->block,
-                                    page_res_it.row()->row,
-                                    page_res_it.word())))
+      if (!(this->*word_processor)(&page_res_it))
        return;
    }
  }
--- a/ccmain/par_control.cpp
+++ b/ccmain/par_control.cpp
@ -39,13 +39,11 @@ void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
  for (int w = 0; w < words.size(); ++w) {
    if (words[w].word->ratings != NULL &&
        words[w].word->ratings->get(0, 0) == NULL) {
-      for (int b = 0; b < words[w].word->chopped_word->NumBlobs(); ++b) {
-        blobs.push_back(BlobData(b, this, *words[w].word));
-      }
      for (int s = 0; s < words[w].lang_words.size(); ++s) {
-        const WERD_RES& word = words[w].lang_words[s];
+        Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
+        const WERD_RES& word = *words[w].lang_words[s];
        for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
-          blobs.push_back(BlobData(b, sub_langs_[s], word));
+          blobs.push_back(BlobData(b, sub, word));
        }
      }
    }
--- a/ccmain/pgedit.cpp
+++ b/ccmain/pgedit.cpp
@ -306,10 +306,7 @@ SVMenuNode *Tesseract::build_menu_new() {
 *  Redisplay page
 */
 void Tesseract::do_re_display(
-    BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
-                                                ROW* row,
-                                                WERD_RES* word_res)) {
-  PAGE_RES_IT pr_it(current_page_res);
+    BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it)) {
  int block_count = 1;

  image_win->Clear();
@ -317,8 +314,9 @@ void Tesseract::do_re_display(
    image_win->Image(pix_binary_, 0, 0);
  }

+  PAGE_RES_IT pr_it(current_page_res);
  for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
-    (this->*word_painter)(pr_it.block()->block, pr_it.row()->row, word);
+    (this->*word_painter)(&pr_it);
    if (display_baselines && pr_it.row() != pr_it.prev_row())
      pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
    if (display_blocks && pr_it.block() != pr_it.prev_block())
@ -714,11 +712,10 @@ void show_point(PAGE_RES* page_res, float x, float y) {
 #endif  // GRAPHICS_DISABLED
 namespace tesseract {
 #ifndef GRAPHICS_DISABLED
-BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row,
-                                             WERD_RES* word_res) {
-  word_res->word->bounding_box().plot(image_win, ScrollView::BLACK,
-                                      ScrollView::BLACK);
-  return word_set_display(block, row, word_res);
+BOOL8 Tesseract:: word_blank_and_set_display(PAGE_RES_IT* pr_it) {
+  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
+                                           ScrollView::BLACK);
+  return word_set_display(pr_it);
 }


@ -727,7 +724,8 @@ BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row,
 *
 * Normalize word and display in word window
 */
-BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
+BOOL8 Tesseract::word_bln_display(PAGE_RES_IT* pr_it) {
+  WERD_RES* word_res = pr_it->word();
  if (word_res->chopped_word == NULL) {
    // Setup word normalization parameters.
    word_res->SetupForRecognition(unicharset, this, BestPix(),
@ -735,7 +733,7 @@ BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
                                  classify_bln_numeric_mode,
                                  textord_use_cjk_fp_model,
                                  poly_allow_detailed_fx,
-                                  row, block);
+                                  pr_it->row()->row, pr_it->block()->block);
  }
  bln_word_window_handle()->Clear();
  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
@ -758,7 +756,8 @@ BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
 *
 *  Display a word according to its display modes
 */
-BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
+BOOL8 Tesseract::word_display(PAGE_RES_IT* pr_it) {
+  WERD_RES* word_res = pr_it->word();
  WERD* word = word_res->word;
  TBOX word_bb;                   // word bounding box
  int word_height;               // ht of word BB
@ -918,14 +917,15 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
 *
 * Dump members to the debug window
 */
-BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) {
-  if (block != NULL) {
+BOOL8 Tesseract::word_dumper(PAGE_RES_IT* pr_it) {
+  if (pr_it->block()->block != NULL) {
    tprintf("\nBlock data...\n");
-    block->print(NULL, FALSE);
+    pr_it->block()->block->print(NULL, FALSE);
  }
  tprintf("\nRow data...\n");
-  row->print(NULL);
+  pr_it->row()->row->print(NULL);
  tprintf("\nWord data...\n");
+  WERD_RES* word_res = pr_it->word();
  word_res->word->print();
  if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
      word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
@ -941,8 +941,8 @@ BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) {
 *
 * Display word according to current display mode settings
 */
-BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
-  WERD* word = word_res->word;
+BOOL8 Tesseract::word_set_display(PAGE_RES_IT* pr_it) {
+  WERD* word = pr_it->word()->word;
  word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
  word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
  word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
@ -950,26 +950,24 @@ BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
  word->set_display_flag(DF_BN_POLYGONAL,
    word_display_mode.bit(DF_BN_POLYGONAL));
  word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));
-  return word_display(block, row, word_res);
+  return word_display(pr_it);
 }

 // page_res is non-const because the iterator doesn't know if you are going
 // to change the items it points to! Really a const here though.
 void Tesseract::blob_feature_display(PAGE_RES* page_res,
                                     const TBOX& selection_box) {
-  ROW* row;               // row of word
-  BLOCK* block;           // block of word
-  WERD* word = make_pseudo_word(page_res, selection_box, block, row);
-  if (word != NULL) {
-    WERD_RES word_res(word);
-    word_res.x_height = row->x_height();
-    word_res.SetupForRecognition(unicharset, this, BestPix(),
-                                 tessedit_ocr_engine_mode, NULL,
-                                 classify_bln_numeric_mode,
-                                 textord_use_cjk_fp_model,
-                                 poly_allow_detailed_fx,
-                                 row, block);
-    TWERD* bln_word = word_res.chopped_word;
+  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
+  if (it != NULL) {
+    WERD_RES* word_res = it->word();
+    word_res->x_height = it->row()->row->x_height();
+    word_res->SetupForRecognition(unicharset, this, BestPix(),
+                                  tessedit_ocr_engine_mode, NULL,
+                                  classify_bln_numeric_mode,
+                                  textord_use_cjk_fp_model,
+                                  poly_allow_detailed_fx,
+                                  it->row()->row, it->block()->block);
+    TWERD* bln_word = word_res->chopped_word;
    TBLOB* bln_blob = bln_word->blobs[0];
    INT_FX_RESULT_STRUCT fx_info;
    GenericVector<INT_FEATURE_STRUCT> bl_features;
@ -989,7 +987,8 @@ void Tesseract::blob_feature_display(PAGE_RES* page_res,
      RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
    cn_win->Update();

-    delete word;
+    it->DeleteCurrentWord();
+    delete it;
  }
 }

--- a/ccmain/recogtraining.cpp
+++ b/ccmain/recogtraining.cpp
@ -51,15 +51,11 @@ FILE *Tesseract::init_recog_training(const STRING &fname) {

 // Copies the bounding box from page_res_it->word() to the given TBOX.
 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
-  while (page_res_it->block() != NULL) {
-    if (page_res_it->word() != NULL)
-      break;
+  while (page_res_it->block() != NULL && page_res_it->word() == NULL)
    page_res_it->forward();
-  }

  if (page_res_it->word() != NULL) {
    *tbox = page_res_it->word()->word->bounding_box();
-    page_res_it->forward();

    // If tbox->left() is negative, the training image has vertical text and
    // all the coordinates of bounding boxes of page_res are rotated by 90
@ -109,26 +105,34 @@ void Tesseract::recog_training_segmented(const STRING &fname,
    // Align bottom left points of the TBOXes.
    while (keep_going &&
           !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
-      keep_going = (bbox.bottom() < tbox.bottom()) ?
-          read_t(&page_res_it, &tbox) :
-            ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+      if (bbox.bottom() < tbox.bottom()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
+                                 &bbox);
+      }
    }
    while (keep_going &&
           !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
-      keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
-          ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+      if (bbox.left() > tbox.left()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
+                                 &bbox);
+      }
    }
    // OCR the word if top right points of the TBOXes are similar.
    if (keep_going &&
        NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
-        ambigs_classify_and_output(page_res_it.prev_word(),
-                                   page_res_it.prev_row(),
-                                   page_res_it.prev_block(),
-                                   label.string(), output_file);
+        ambigs_classify_and_output(label.string(), &page_res_it, output_file);
        examined_words++;
    }
+    page_res_it.forward();
  } while (keep_going);
+  fclose(box_file);

  // Set up scripts on all of the words that did not get sent to
  // ambigs_classify_and_output.  They all should have, but if all the
@ -196,16 +200,16 @@ static void PrintMatrixPaths(int col, int dim,
 // raw choice as a result of the classification. For words labeled with a
 // single unichar also outputs all alternatives from blob_choices of the
 // best choice.
-void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
-                                           ROW_RES *row_res,
-                                           BLOCK_RES *block_res,
-                                           const char *label,
+void Tesseract::ambigs_classify_and_output(const char *label,
+                                           PAGE_RES_IT* pr_it,
                                           FILE *output_file) {
  // Classify word.
  fflush(stdout);
-  WordData word_data(block_res->block, row_res->row, werd_res);
+  WordData word_data(*pr_it);
  SetupWordPassN(1, &word_data);
-  classify_word_pass1(&word_data, werd_res);
+  classify_word_and_language(&Tesseract::classify_word_pass1,
+                             pr_it, &word_data);
+  WERD_RES* werd_res = word_data.word;
  WERD_CHOICE *best_choice = werd_res->best_choice;
  ASSERT_HOST(best_choice != NULL);

--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@ -96,8 +96,6 @@ Tesseract::Tesseract()
                " whose outlines overlap horizontally.", this->params()),
    BOOL_MEMBER(tessedit_display_outwords, false,
                "Draw output words", this->params()),
-    BOOL_MEMBER(tessedit_training_tess, false,
-                "Call Tess to learn blobs", this->params()),
    BOOL_MEMBER(tessedit_dump_choices, false,
                "Dump char choices", this->params()),
    BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
@ -315,16 +313,6 @@ Tesseract::Tesseract()
                "Write .html hOCR output file", this->params()),
    BOOL_MEMBER(tessedit_create_pdf, false,
                "Write .pdf output file", this->params()),
-    INT_MEMBER(tessedit_pdf_compression, 0,
-               "Type of image compression in pdf output: "
-               "0 - autoselection (default); "
-               "1 - jpeg; "
-               "2 - G4; "
-               "3 - flate",
-               this->params()),
-    INT_MEMBER(tessedit_pdf_jpg_quality, 85,
-               "Quality level of jpeg image compression in pdf output",
-               this->params()),
    STRING_MEMBER(unrecognised_char, "|",
                  "Output char for unidentified blobs", this->params()),
    INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@ -31,20 +31,20 @@
 #include "textord.h"
 #include "wordrec.h"

-class PAGE_RES;
-class PAGE_RES_IT;
+class BLOB_CHOICE_LIST_CLIST;
 class BLOCK_LIST;
 class CharSamp;
-class TO_BLOCK_LIST;
-class WERD_RES;
-class ROW;
-class TBOX;
-class SVMenuNode;
-struct Pix;
-class WERD_CHOICE;
-class WERD;
-class BLOB_CHOICE_LIST_CLIST;
 struct OSResults;
+class PAGE_RES;
+class PAGE_RES_IT;
+struct Pix;
+class ROW;
+class SVMenuNode;
+class TBOX;
+class TO_BLOCK_LIST;
+class WERD;
+class WERD_CHOICE;
+class WERD_RES;


 // Top-level class for all tesseract global instance data.
@ -144,10 +144,19 @@ struct WordData {
  ROW* row;
  BLOCK* block;
  WordData* prev_word;
-  GenericVector<WERD_RES> lang_words;
+  PointerVector<WERD_RES> lang_words;
 };

-typedef void (Tesseract::*WordRecognizer)(WordData* word_data, WERD_RES* word);
+// Definition of a Tesseract WordRecognizer. The WordData provides the context
+// of row/block; in_word holds an initialized, possibly pre-classified word
+// that the recognizer may or may not consume (if it does, it sets
+// *in_word=NULL). It produces one or more output words in out_words, which
+// may be the consumed in_word, or may be generated independently.
+// This API supports either a conventional tesseract classifier or a
+// line-level classifier that generates multiple words from a merged input.
+typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,
+                                          WERD_RES** in_word,
+                                          PointerVector<WERD_RES>* out_words);

 class Tesseract : public Wordrec {
 public:
@ -279,6 +288,7 @@ class Tesseract : public Wordrec {
  void SetupWordPassN(int pass_n, WordData* word);
  // Runs word recognition on all the words.
  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
+                          PAGE_RES_IT* pr_it,
                          GenericVector<WordData>* words);
  bool recog_all_words(PAGE_RES* page_res,
                       ETEXT_DESC* monitor,
@ -294,28 +304,35 @@ class Tesseract : public Wordrec {
  // Sets script positions and detects smallcaps on all output words.
  void script_pos_pass(PAGE_RES* page_res);
  // Helper to recognize the word using the given (language-specific) tesseract.
-  // Returns true if the result was better than previously.
-  bool RetryWithLanguage(const WERD_RES& best_word, WordData* word_data,
-                         WERD_RES* word, WordRecognizer recognizer);
+  // Returns positive if this recognizer found more new best words than the
+  // number kept from best_words.
+  int RetryWithLanguage(const WordData& word_data,
+                        WordRecognizer recognizer,
+                        WERD_RES** in_word,
+                        PointerVector<WERD_RES>* best_words);
  void classify_word_and_language(WordRecognizer recognizer,
+                                  PAGE_RES_IT* pr_it,
                                  WordData* word_data);
-  void classify_word_pass1(WordData* word_data, WERD_RES* word);
+  void classify_word_pass1(const WordData& word_data,
+                           WERD_RES** in_word,
+                           PointerVector<WERD_RES>* out_words);
  void recog_pseudo_word(PAGE_RES* page_res,  // blocks to check
                         TBOX &selection_box);

  void fix_rep_char(PAGE_RES_IT* page_res_it);
-  void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it);

  ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
                                              const char *s,
                                              const char *lengths);
  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
-  void classify_word_pass2(WordData* word_data, WERD_RES* word);
+  void classify_word_pass2(const WordData& word_data,
+                           WERD_RES** in_word,
+                           PointerVector<WERD_RES>* out_words);
  void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
                          WERD_RES* word, WERD_RES* new_word);
  bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
  bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
-  BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res);
+  BOOL8 recog_interactive(PAGE_RES_IT* pr_it);

  // Set fonts of this word.
  void set_word_fonts(WERD_RES *word);
@ -473,15 +490,13 @@ class Tesseract : public Wordrec {
                             );
  void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
  void do_re_display(
-      BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
-                                                  ROW* row,
-                                                  WERD_RES* word_res));
-  BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res);
-  BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res);
-  BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
-  BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
+      BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
+  BOOL8 word_display(PAGE_RES_IT* pr_it);
+  BOOL8 word_bln_display(PAGE_RES_IT* pr_it);
+  BOOL8 word_blank_and_set_display(PAGE_RES_IT* pr_its);
+  BOOL8 word_set_display(PAGE_RES_IT* pr_it);
  // #ifndef GRAPHICS_DISABLED
-  BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res);
+  BOOL8 word_dumper(PAGE_RES_IT* pr_it);
  // #endif  // GRAPHICS_DISABLED
  void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
  //// reject.h //////////////////////////////////////////////////////////
@ -537,10 +552,7 @@ class Tesseract : public Wordrec {
  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
  inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
-  void fix_fuzzy_space_list(  //space explorer
-                            WERD_RES_LIST &best_perm,
-                            ROW *row,
-                            BLOCK* block);
+  void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
  void fix_fuzzy_spaces(                      //find fuzzy words
                        ETEXT_DESC *monitor,  //progress monitor
@ -583,9 +595,7 @@ class Tesseract : public Wordrec {
      PAGE_RES* page_res, // blocks to check
      //function to call
      TBOX & selection_box,
-      BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block,
-                                                     ROW* row,
-                                                     WERD_RES* word_res));
+      BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
  //// tessbox.cpp ///////////////////////////////////////////////////////
  void tess_add_doc_word(                          //test acceptability
                         WERD_CHOICE *word_choice  //after context
@ -752,7 +762,6 @@ class Tesseract : public Wordrec {
             "Each bounding box is assumed to contain ngrams. Only"
             " learn the ngrams whose outlines overlap horizontally.");
  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
-  BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
  BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
  BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
@ -908,13 +917,6 @@ class Tesseract : public Wordrec {
  BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
  BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
  BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
-  INT_VAR_H(tessedit_pdf_compression, 0, "Type of image encoding in pdf output:"
-            "0 - autoselection (default); "
-            "1 - jpeg; "
-            "2 - G4; "
-            "3 - flate");
-  INT_VAR_H(tessedit_pdf_jpg_quality, 85, "Quality level of jpeg image "
-            "compression in pdf output");
  STRING_VAR_H(unrecognised_char, "|",
               "Output char for unidentified blobs");
  INT_VAR_H(suspect_level, 99, "Suspect marker level");
@ -1046,10 +1048,8 @@ class Tesseract : public Wordrec {
                                PAGE_RES *page_res,
                                volatile ETEXT_DESC *monitor,
                                FILE *output_file);
-  void ambigs_classify_and_output(WERD_RES *werd_res,
-                                  ROW_RES *row_res,
-                                  BLOCK_RES *block_res,
-                                  const char *label,
+  void ambigs_classify_and_output(const char *label,
+                                  PAGE_RES_IT* pr_it,
                                  FILE *output_file);

  inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
--- a/ccmain/thresholder.cpp
+++ b/ccmain/thresholder.cpp
@ -171,7 +171,7 @@ void ImageThresholder::SetImage(const Pix* pix) {
 // Threshold the source image as efficiently as possible to the output Pix.
 // Creates a Pix and sets pix to point to the resulting pointer.
 // Caller must use pixDestroy to free the created Pix.
-void ImageThresholder::ThresholdToPix(Pix** pix) {
+void ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) {
  if (pix_channels_ == 0) {
    // We have a binary image, so it just has to be cloned.
    *pix = GetPixRect();
--- a/ccmain/thresholder.h
+++ b/ccmain/thresholder.h
@ -20,7 +20,8 @@
 #ifndef TESSERACT_CCMAIN_THRESHOLDER_H__
 #define TESSERACT_CCMAIN_THRESHOLDER_H__

-#include          "platform.h"
+#include "platform.h"
+#include "publictypes.h"

 struct Pix;

@ -116,7 +117,7 @@ class TESS_API ImageThresholder {
  /// Threshold the source image as efficiently as possible to the output Pix.
  /// Creates a Pix and sets pix to point to the resulting pointer.
  /// Caller must use pixDestroy to free the created Pix.
-  virtual void ThresholdToPix(Pix** pix);
+  virtual void ThresholdToPix(PageSegMode pageseg_mode, Pix** pix);

  // Gets a pix that contains an 8 bit threshold value at each pixel. The
  // returned pix may be an integer reduction of the binary image such that
--- a/ccmain/werdit.cpp
+++ b/ccmain/werdit.cpp
@ -23,17 +23,15 @@
 * make_pseudo_word
 *
 * Make all the blobs inside a selection into a single word.
- * The word is always a copy and needs to be deleted.
+ * The returned PAGE_RES_IT* it points to the new word. After use, call
+ * it->DeleteCurrentWord() to delete the fake word, and then
+ * delete it to get rid of the iterator itself.
 **********************************************************************/

-WERD *make_pseudo_word(PAGE_RES* page_res,  // Blocks to check.
-                       const TBOX &selection_box,
-                       BLOCK *&pseudo_block,
-                       ROW *&pseudo_row) {      // Row of selection.
+PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT pr_it(page_res);
  C_BLOB_LIST new_blobs;               // list of gathered blobs
  C_BLOB_IT new_blob_it = &new_blobs;  // iterator
-  WERD *pseudo_word;                   // fabricated word

  for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
       word_res = pr_it.forward()) {
@ -45,15 +43,17 @@ WERD *make_pseudo_word(PAGE_RES* page_res,  // Blocks to check.
        C_BLOB* blob = blob_it.data();
        if (blob->bounding_box().overlap(selection_box)) {
          new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
-          pseudo_row = pr_it.row()->row;
-          pseudo_block = pr_it.block()->block;
        }
      }
+      if (!new_blobs.empty()) {
+        WERD* pseudo_word = new WERD(&new_blobs, 1, NULL);
+        word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
+        PAGE_RES_IT* it = new PAGE_RES_IT(page_res);
+        while (it->word() != word_res && it->word() != NULL) it->forward();
+        ASSERT_HOST(it->word() == word_res);
+        return it;
+      }
    }
  }
-  if (!new_blobs.empty())
-    pseudo_word = new WERD(&new_blobs, 1, NULL);
-  else
-    pseudo_word = NULL;
-  return pseudo_word;
+  return NULL;
 }
--- a/ccmain/werdit.h
+++ b/ccmain/werdit.h
@ -22,9 +22,6 @@

 #include          "pageres.h"

-WERD *make_pseudo_word(PAGE_RES* page_res,  // blocks to check
-                       const TBOX &selection_box,
-                       BLOCK *&pseudo_block,
-                       ROW *&pseudo_row);
+PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box);

 #endif
--- a/ccstruct/boxword.cpp
+++ b/ccstruct/boxword.cpp
@ -157,6 +157,13 @@ void BoxWord::InsertBox(int index, const TBOX& box) {
  ComputeBoundingBox();
 }

+// Overwrites the box stored at the given index with a copy of box, then
+// calls ComputeBoundingBox() to recompute the bounding box.
+void BoxWord::ChangeBox(int index, const TBOX& box) {
+  boxes_[index] = box;  // NOTE(review): index is not range-checked here.
+  ComputeBoundingBox();
+}
+
 // Deletes the box with the given index, and shuffles up the rest.
 // Recomputes the bounding box.
 void BoxWord::DeleteBox(int index) {
--- a/ccstruct/boxword.h
+++ b/ccstruct/boxword.h
@ -63,6 +63,10 @@ class BoxWord {
  // Recomputes the bounding box.
  void InsertBox(int index, const TBOX& box);

+  // Changes the box at the given index to the new box.
+  // Recomputes the bounding box.
+  void ChangeBox(int index, const TBOX& box);
+
  // Deletes the box with the given index, and shuffles up the rest.
  // Recomputes the bounding box.
  void DeleteBox(int index);
--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@ -34,6 +34,13 @@ static const double kStopperAmbiguityThresholdGain = 8.0;
 static const double kStopperAmbiguityThresholdOffset = 1.5;
 // Max number of broken pieces to associate.
 const int kWordrecMaxNumJoinChunks = 4;
+// Max ratio of word box height to line size to allow it to be processed as
+// a line with other words.
+const double kMaxWordSizeRatio = 1.25;
+// Max ratio of line box height to line size to allow a new word to be added.
+const double kMaxLineSizeRatio = 1.25;
+// Max ratio of word gap to line size to allow a new word to be added.
+const double kMaxWordGapRatio = 2.0;

 // Computes and returns a threshold of certainty difference used to determine
 // which words to keep, based on the adjustment factors of the two words.
@ -49,6 +56,7 @@ static double StopperAmbigThreshold(double f1, double f2) {
 * Constructor for page results
 *************************************************************************/
 PAGE_RES::PAGE_RES(
+    bool merge_similar_words,
    BLOCK_LIST *the_block_list,
    WERD_CHOICE **prev_word_best_choice_ptr) {
  Init();
@ -56,7 +64,8 @@ PAGE_RES::PAGE_RES(
  BLOCK_RES_IT block_res_it(&block_res_list);
  for (block_it.mark_cycle_pt();
       !block_it.cycled_list(); block_it.forward()) {
-    block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
+    block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
+                                          block_it.data()));
  }
  prev_word_best_choice = prev_word_best_choice_ptr;
 }
@ -67,7 +76,7 @@ PAGE_RES::PAGE_RES(
 * Constructor for BLOCK results
 *************************************************************************/

-BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
+BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
  ROW_IT row_it (the_block->row_list ());
  ROW_RES_IT row_res_it(&row_res_list);

@ -83,22 +92,20 @@ BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
  block = the_block;

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
-    row_res_it.add_to_end(new ROW_RES(row_it.data()));
+    row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
  }
 }

-
 /*************************************************************************
 * ROW_RES::ROW_RES
 *
 * Constructor for ROW results
 *************************************************************************/

-ROW_RES::ROW_RES(ROW *the_row) {
+ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
  WERD_IT word_it(the_row->word_list());
  WERD_RES_IT word_res_it(&word_res_list);
  WERD_RES *combo = NULL;        // current combination of fuzzies
-  WERD_RES *word_res;            // current word
  WERD *copy_word;

  char_count = 0;
@ -106,20 +113,48 @@ ROW_RES::ROW_RES(ROW *the_row) {
  whole_word_rej_count = 0;

  row = the_row;
+  bool add_next_word = false;
+  TBOX union_box;
+  float line_height = the_row->x_height() + the_row->ascenders() -
+      the_row->descenders();
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
-    word_res = new WERD_RES(word_it.data());
+    WERD_RES* word_res = new WERD_RES(word_it.data());
    word_res->x_height = the_row->x_height();
-
-    if (word_res->word->flag(W_FUZZY_NON)) {
+    if (add_next_word) {
      ASSERT_HOST(combo != NULL);
+      // We are adding this word to the combination.
      word_res->part_of_combo = TRUE;
      combo->copy_on(word_res);
+    } else if (merge_similar_words) {
+      union_box = word_res->word->bounding_box();
+      add_next_word = !word_res->word->flag(W_REP_CHAR) &&
+          union_box.height() <= line_height * kMaxWordSizeRatio;
+      word_res->odd_size = !add_next_word;
    }
-    if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
+    WERD* next_word = word_it.data_relative(1);
+    if (merge_similar_words) {
+      if (add_next_word && !next_word->flag(W_REP_CHAR)) {
+        // Next word will be added on if all of the following are true:
+        // Not a rep char.
+        // Box height small enough.
+        // Union box height small enough.
+        // Horizontal gap small enough.
+        TBOX next_box = next_word->bounding_box();
+        int prev_right = union_box.right();
+        union_box += next_box;
+        if (next_box.height() > line_height * kMaxWordSizeRatio ||
+            union_box.height() > line_height * kMaxLineSizeRatio ||
+            next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
+          add_next_word = false;
+        }
+      }
+    } else {
+      add_next_word = next_word->flag(W_FUZZY_NON);
+    }
+    if (add_next_word) {
      if (combo == NULL) {
        copy_word = new WERD;
-                                 //deep copy
-        *copy_word = *(word_it.data());
+        *copy_word = *(word_it.data());  // deep copy
        combo = new WERD_RES(copy_word);
        combo->x_height = the_row->x_height();
        combo->combination = TRUE;
@ -208,6 +243,7 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) {
  done = source.done;
  unlv_crunch_mode = source.unlv_crunch_mode;
  small_caps = source.small_caps;
+  odd_size = source.odd_size;
  italic = source.italic;
  bold = source.bold;
  fontinfo = source.fontinfo;
@ -318,8 +354,7 @@ void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
      TBOX box = b_it.data()->bounding_box();
      box_word->InsertBox(box_word->length(), box);
-      fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
-                                                -1, -1, -1, 0, 0, 0, BCC_FAKE);
+      fake_choices[blob_id++] = new BLOB_CHOICE;
    }
    FakeClassifyWord(blob_count, fake_choices);
    delete [] fake_choices;
@ -446,6 +481,13 @@ void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
  }
 }

+// Prints tess_accepted, tess_would_adapt and done, then best_choice with msg.
+void WERD_RES::DebugTopChoice(const char* msg) const {
+  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
+          tess_accepted, tess_would_adapt, done);
+  best_choice->print(msg);  // NOTE(review): dereferenced without a NULL check.
+}
+
 // Removes from best_choices all choices which are not within a reasonable
 // range of the best choice.
 // TODO(rays) incorporate the information used here into the params training
@ -830,6 +872,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
  }
  FakeWordFromRatings();
  reject_map.initialise(blob_count);
+  done = true;
 }

 // Creates a WERD_CHOICE for the word using the top choices from the leading
@ -1038,6 +1081,7 @@ void WERD_RES::InitNonPointers() {
  done = FALSE;
  unlv_crunch_mode = CR_NONE;
  small_caps = false;
+  odd_size = false;
  italic = FALSE;
  bold = FALSE;
  // The fontinfos and tesseract count as non-pointers as they point to
@ -1239,6 +1283,159 @@ WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
  return new_res;
 }

+// Helper appends to blob_ends, for each entry of word.best_state, the x
+// midpoint between the right of its blobs and the left of the following blob
+// (from next_word_blobs if the word runs out), or MAX_INT32 if there is none.
+static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
+                            GenericVector<int>* blob_ends) {
+  C_BLOB_IT blob_it(word.word->cblob_list());
+  for (int i = 0; i < word.best_state.size(); ++i) {
+    int length = word.best_state[i];
+    // Union the boxes of the length fake blobs that make up entry i.
+    TBOX blob_box = blob_it.data()->bounding_box();
+    blob_it.forward();
+    for (int b = 1; b < length; ++b) {
+      blob_box += blob_it.data()->bounding_box();
+      blob_it.forward();
+    }
+    // The blob boxes are only rough (e.g. LSTM marks a character at a single
+    // pixel), so only the boundaries between them are used.
+    int blob_end = MAX_INT32;  // Kept when no following blob is available.
+    if (!blob_it.at_first() || next_word_blobs != NULL) {
+      if (blob_it.at_first())  // Out of blobs here: use the next word's list.
+        blob_it.set_to_list(next_word_blobs);
+      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
+    }
+    blob_ends->push_back(blob_end);
+  }
+}
+
+// Replaces the current WERD/WERD_RES with the given words, taking ownership
+// of them and clearing *words. The given words contain fake blobs marking the
+// character positions; these are replaced by real blobs of the current word.
+void PAGE_RES_IT::ReplaceCurrentWord(
+    tesseract::PointerVector<WERD_RES>* words) {
+  WERD_RES* input_word = word();
+  // Set the BOL/EOL flags on the words from the input word.
+  if (input_word->word->flag(W_BOL)) {
+    (*words)[0]->word->set_flag(W_BOL, true);  // NOTE(review): needs size() > 0.
+  } else {
+    (*words)[0]->word->set_blanks(1);
+  }
+  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
+
+  // Move the blobs from the input word to the new set of words.
+  // If the input word_res is a combination, then the replacements will also be
+  // combinations, and will own their own words. If the input word_res is not a
+  // combination, then the final replacements will not be either, (although it
+  // is allowed for the input words to be combinations) and their words
+  // will get put on the row list. This maintains the ownership rules.
+  WERD_IT w_it(row()->row->word_list());
+  if (!input_word->combination) {
+    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+      WERD* word = w_it.data();
+      if (word == input_word->word)
+        break;
+    }
+    // w_it is now set to the input_word's word.
+    ASSERT_HOST(!w_it.cycled_list());
+  }
+  // Insert into the appropriate place in the ROW_RES.
+  WERD_RES_IT wr_it(&row()->word_res_list);
+  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
+    WERD_RES* word = wr_it.data();
+    if (word == input_word)
+      break;
+  }
+  ASSERT_HOST(!wr_it.cycled_list());
+  // Since we only have an estimate of the bounds between blobs, use the blob
+  // x-middle to decide which replacement word and box each blob goes to.
+  C_BLOB_IT src_b_it(input_word->word->cblob_list());
+  src_b_it.sort(&C_BLOB::SortByXMiddle);
+  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
+  rej_b_it.sort(&C_BLOB::SortByXMiddle);
+  for (int w = 0; w < words->size(); ++w) {
+    WERD_RES* word_w = (*words)[w];
+    // Compute blob boundaries while the fake blobs are still in place.
+    GenericVector<int> blob_ends;
+    C_BLOB_LIST* next_word_blobs =
+        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : NULL;
+    ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
+    // Delete the fake blobs on the current word.
+    word_w->word->cblob_list()->clear();
+    C_BLOB_IT dest_it(word_w->word->cblob_list());
+    // Build the box word as we move the blobs.
+    tesseract::BoxWord* box_word = new tesseract::BoxWord;
+    for (int i = 0; i < blob_ends.size(); ++i) {
+      int end_x = blob_ends[i];
+      TBOX blob_box;
+      // Add the blobs, then the rejected blobs, with x-middle before end_x.
+      while (!src_b_it.empty() &&
+             src_b_it.data()->bounding_box().x_middle() < end_x) {
+        blob_box += src_b_it.data()->bounding_box();
+        dest_it.add_after_then_move(src_b_it.extract());
+        src_b_it.forward();
+      }
+      while (!rej_b_it.empty() &&
+             rej_b_it.data()->bounding_box().x_middle() < end_x) {
+        blob_box += rej_b_it.data()->bounding_box();
+        dest_it.add_after_then_move(rej_b_it.extract());
+        rej_b_it.forward();
+      }
+      // Clip to the previously computed bounds. Although imperfectly accurate,
+      // it is good enough, and much more complicated to determine where else
+      // to clip.
+      if (i > 0 && blob_box.left() < blob_ends[i - 1])
+        blob_box.set_left(blob_ends[i - 1]);
+      if (blob_box.right() > end_x)
+        blob_box.set_right(end_x);
+      box_word->InsertBox(i, blob_box);
+    }
+    // Fix empty boxes. If a very joined blob sits over multiple characters,
+    // then we will have some empty boxes from using the middle, so look for
+    // overlaps.
+    for (int i = 0; i < box_word->length(); ++i) {
+      TBOX box = box_word->BlobBox(i);
+      if (box.null_box()) {
+        // Nothing has its middle in the bounds of this blob, so use the first
+        // moved blob that overlaps them, clipped to those bounds.
+        for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
+             dest_it.forward()) {
+          TBOX blob_box = dest_it.data()->bounding_box();
+          if (blob_box.left() < blob_ends[i] &&
+              (i == 0 || blob_box.right() >= blob_ends[i - 1])) {
+            if (i > 0 && blob_box.left() < blob_ends[i - 1])
+              blob_box.set_left(blob_ends[i - 1]);
+            if (blob_box.right() > blob_ends[i])
+              blob_box.set_right(blob_ends[i]);
+            box_word->ChangeBox(i, blob_box);
+            break;
+          }
+        }
+      }
+    }
+    delete word_w->box_word;
+    word_w->box_word = box_word;
+    if (!input_word->combination) {
+      // Insert word_w->word into the ROW. It doesn't own its word, so the
+      // ROW needs to own it.
+      w_it.add_before_stay_put(word_w->word);
+      word_w->combination = false;
+    }
+    (*words)[w] = NULL;  // We are taking ownership.
+    wr_it.add_before_stay_put(word_w);
+  }
+  // We have taken ownership of the words.
+  words->clear();
+  // Delete the current word, which has been replaced. We could just call
+  // DeleteCurrentWord, but that would iterate both lists again, and we know
+  // we are already in the right place.
+  if (!input_word->combination)
+    delete w_it.extract();
+  delete wr_it.extract();
+  ResetWordIterator();  // Re-sync word_res_it and the word_res pointers.
+}
+
 // Deletes the current WERD_RES and its underlying WERD.
 void PAGE_RES_IT::DeleteCurrentWord() {
  // Check that this word is as we expect. part_of_combos are NEVER iterated
@ -1298,18 +1495,30 @@ WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
 // Resets the word_res_it so that it is one past the next_word_res, as
 // it should be after internal_forward. If next_row_res != row_res,
 // then the next_word_res is in the next row, so there is no need to do
-// anything, since operations on the current word will not have disturbed
-// the word_res_it.
+// anything to word_res_it, but it is still a good idea to reset the pointers
+// word_res and prev_word_res, which are still in the current row.
 void PAGE_RES_IT::ResetWordIterator() {
  if (row_res == next_row_res) {
    // Reset the member iterator so it can move forward and detect the
    // cycled_list state correctly.
    word_res_it.move_to_first();
    word_res_it.mark_cycle_pt();
-    while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
+    while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) {
+      if (prev_row_res == row_res)  // prev_word_res lives in this row too.
+        prev_word_res = word_res;
+      word_res = word_res_it.data();  // Ends as the word before next_word_res.
       word_res_it.forward();
+    }
     ASSERT_HOST(!word_res_it.cycled_list());
     word_res_it.forward();
+  } else {
+    // word_res_it is OK, but reset word_res and prev_word_res if needed.
+    WERD_RES_IT wr_it(&row_res->word_res_list);
+    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
+      if (prev_row_res == row_res)  // prev_word_res lives in this row too.
+        prev_word_res = word_res;
+      word_res = wr_it.data();  // Ends as the last word of the current row.
+    }
   }
  }

--- a/ccstruct/pageres.h
+++ b/ccstruct/pageres.h
@ -82,7 +82,8 @@ class PAGE_RES {                 // page result

  PAGE_RES() { Init(); }  // empty constructor

-  PAGE_RES(BLOCK_LIST *block_list,   // real blocks
+  PAGE_RES(bool merge_similar_words,
+           BLOCK_LIST *block_list,   // real blocks
           WERD_CHOICE **prev_word_best_choice_ptr);

  ~PAGE_RES () {               // destructor
@ -111,7 +112,7 @@ class BLOCK_RES:public ELIST_LINK {
  BLOCK_RES() {
  }                            // empty constructor

-  BLOCK_RES(BLOCK *the_block);  // real block
+  BLOCK_RES(bool merge_similar_words, BLOCK *the_block);  // real block

  ~BLOCK_RES () {              // destructor
  }
@ -132,7 +133,7 @@ class ROW_RES:public ELIST_LINK {
  ROW_RES() {
  }                            // empty constructor

-  ROW_RES(ROW *the_row);  // real row
+  ROW_RES(bool merge_similar_words, ROW *the_row);  // real row

  ~ROW_RES() {                // destructor
  }
@ -279,7 +280,8 @@ class WERD_RES : public ELIST_LINK {
  BOOL8 tess_accepted;          // Tess thinks its ok?
  BOOL8 tess_would_adapt;       // Tess would adapt?
  BOOL8 done;                   // ready for output?
-  bool small_caps;             // word appears to be small caps
+  bool small_caps;              // word appears to be small caps
+  bool odd_size;                // word is bigger than line or leader dots.
  inT8 italic;
  inT8 bold;
  // The fontinfos are pointers to data owned by the classifier.
@ -486,6 +488,9 @@ class WERD_RES : public ELIST_LINK {
  // the word_to_debug.
  void DebugWordChoices(bool debug, const char* word_to_debug);

+  // Prints the top choice along with the accepted/done flags.
+  void DebugTopChoice(const char* msg) const;
+
  // Removes from best_choices all choices which are not within a reasonable
  // range of the best choice.
  void FilterWordChoices(int debug_level);
@ -694,6 +699,11 @@ class PAGE_RES_IT {
  // the resulting WERD_RES is returned for further setup with best_choice etc.
  WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);

+  // Replaces the current WERD/WERD_RES with the given words. The given words
+  // contain fake blobs that indicate the position of the characters. These are
+  // replaced with real blobs from the current word as much as possible.
+  void ReplaceCurrentWord(tesseract::PointerVector<WERD_RES>* words);
+
  // Deletes the current WERD_RES and its underlying WERD.
  void DeleteCurrentWord();

--- a/ccstruct/publictypes.h
+++ b/ccstruct/publictypes.h
@ -164,28 +164,37 @@ enum PageSegMode {
  PSM_SINGLE_CHAR,    ///< Treat the image as a single character.
  PSM_SPARSE_TEXT,    ///< Find as much text as possible in no particular order.
  PSM_SPARSE_TEXT_OSD,  ///< Sparse text with orientation and script det.
+  PSM_RAW_LINE,       ///< Treat the image as a single text line, bypassing
+                      ///< hacks that are Tesseract-specific.

  PSM_COUNT           ///< Number of enum entries.
 };

 /**
- * Macros that act on a PageSegMode to determine whether components of
+ * Inline functions that act on a PageSegMode to determine whether components of
 * layout analysis are enabled.
 * *Depend critically on the order of elements of PageSegMode.*
+ * NOTE that arg is an int for compatibility with INT_PARAM.
 */
-#define PSM_OSD_ENABLED(pageseg_mode) ((pageseg_mode) <= PSM_AUTO_OSD || \
-    (pageseg_mode) == PSM_SPARSE_TEXT_OSD)
-#define PSM_COL_FIND_ENABLED(pageseg_mode) \
-  ((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_AUTO)
-#define PSM_SPARSE(pageseg_mode) \
-  ((pageseg_mode) == PSM_SPARSE_TEXT || (pageseg_mode) == PSM_SPARSE_TEXT_OSD)
-#define PSM_BLOCK_FIND_ENABLED(pageseg_mode) \
-  ((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_COLUMN)
-#define PSM_LINE_FIND_ENABLED(pageseg_mode) \
-  ((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_BLOCK)
-#define PSM_WORD_FIND_ENABLED(pageseg_mode) \
-  (((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_LINE) || \
-   (pageseg_mode) == PSM_SPARSE_TEXT || (pageseg_mode) == PSM_SPARSE_TEXT_OSD)
+inline bool PSM_OSD_ENABLED(int pageseg_mode) {  // Does this mode run OSD?
+  return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
+}
+inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {  // Column finding on?
+  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
+}
+inline bool PSM_SPARSE(int pageseg_mode) {  // One of the sparse-text modes?
+  return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
+}
+inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {  // Block finding on?
+  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
+}
+inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {  // Line finding on?
+  return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
+}
+inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {  // Word finding on?
+  return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
+      pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
+}

 /**
 * enum of the elements of the page hierarchy, used in ResultIterator
--- a/ccstruct/ratngs.h
+++ b/ccstruct/ratngs.h
@ -48,11 +48,11 @@ class BLOB_CHOICE: public ELIST_LINK
 {
  public:
    BLOB_CHOICE() {
-      unichar_id_ = INVALID_UNICHAR_ID;
+      unichar_id_ = UNICHAR_SPACE;
      fontinfo_id_ = -1;
      fontinfo_id2_ = -1;
-      rating_ = MAX_FLOAT32;
-      certainty_ = -MAX_FLOAT32;
+      rating_ = 10.0;
+      certainty_ = -1.0;
      script_id_ = -1;
      xgap_before_ = 0;
      xgap_after_ = 0;
--- a/ccstruct/rect.h
+++ b/ccstruct/rect.h
@ -78,6 +78,12 @@ class DLLSYM TBOX  {  // bounding box
    void set_right(int x) {
      top_right.set_x(x);
    }
+    int x_middle() const {  // Midpoint of the left and right x coordinates.
+      return (bot_left.x() + top_right.x()) / 2;  // Integer division.
+    }
+    int y_middle() const {  // Midpoint of the bottom and top y coordinates.
+      return (bot_left.y() + top_right.y()) / 2;  // Integer division.
+    }

    const ICOORD &botleft() const {  // access function
      return bot_left;
--- a/ccstruct/stepblob.cpp
+++ b/ccstruct/stepblob.cpp
@ -247,10 +247,11 @@ C_BLOB* C_BLOB::FakeBlob(const TBOX& box) {
 * Return the bounding box of the blob.
 **********************************************************************/

-TBOX C_BLOB::bounding_box() {  //bounding box
-  C_OUTLINE *outline;            //current outline
-  C_OUTLINE_IT it = &outlines;   //outlines of blob
-  TBOX box;                       //bounding box
+TBOX C_BLOB::bounding_box() const {  // bounding box
+  C_OUTLINE *outline;                // current outline
+  // This is a read-only iteration of the outlines.
+  C_OUTLINE_IT it = const_cast<C_OUTLINE_LIST*>(&outlines);
+  TBOX box;                          // bounding box

  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
    outline = it.data ();
--- a/ccstruct/stepblob.h
+++ b/ccstruct/stepblob.h
@ -65,7 +65,7 @@ class C_BLOB:public ELIST_LINK
      return &outlines;
    }

-    TBOX bounding_box();  //compute bounding box
+    TBOX bounding_box() const;  // compute bounding box
    inT32 area();  //compute area
    inT32 perimeter();  // Total perimeter of outlines and 1st level children.
    inT32 outer_area();  //compute area
@ -116,6 +116,14 @@ class C_BLOB:public ELIST_LINK
      return blob;
    }

+    static int SortByXMiddle(const void *v1, const void *v2) {
+      const C_BLOB* blob1 = *reinterpret_cast<const C_BLOB* const *>(v1);
+      const C_BLOB* blob2 = *reinterpret_cast<const C_BLOB* const *>(v2);
+      return blob1->bounding_box().x_middle() -
+             blob2->bounding_box().x_middle();  // <0, 0, >0: ascending middle.
+    }
+
+
  private:
    C_OUTLINE_LIST outlines;     //master elements
 };
--- a/ccutil/unicharset.cpp
+++ b/ccutil/unicharset.cpp
@ -17,15 +17,17 @@
 //
 ///////////////////////////////////////////////////////////////////////

+#include "unicharset.h"
+
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>

+#include "params.h"
+#include "serialis.h"
 #include "tesscallback.h"
 #include "tprintf.h"
 #include "unichar.h"
-#include "unicharset.h"
-#include "params.h"

 // Special character used in representing character fragments.
 static const char kSeparator = '|';
@ -448,11 +450,19 @@ void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
  }
 }

-// Makes this a copy of src. Clears this completely first, so the automattic
-// ids will not be present in this if not in src.
+// Makes this a copy of src. Clears this completely first, so the automatic
+// ids will not be present in this if not in src. Does NOT reorder the set!
 void UNICHARSET::CopyFrom(const UNICHARSET& src) {
  clear();
-  AppendOtherUnicharset(src);
+  for (int ch = 0; ch < src.size_used; ++ch) {
+    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
+    const char* utf8 = src.id_to_unichar(ch);
+    unichar_insert(utf8);  // NOTE(review): assumed to receive id ch.
+    unichars[ch].properties.ExpandRangesFrom(src_props);
+  }
+  // Set properties, including mirror and other_case, WITHOUT reordering
+  // the unicharset.
+  PartialSetPropertiesFromOther(0, src);
 }

 // For each id in src, if it does not occur in this, add it, as in
@ -689,8 +699,11 @@ bool UNICHARSET::eq(UNICHAR_ID unichar_id,
  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
 }

-bool UNICHARSET::save_to_file(FILE *file) const {
-  fprintf(file, "%d\n", this->size());
+bool UNICHARSET::save_to_string(STRING *str) const {
+  const int kFileBufSize = 1024;
+  char buffer[kFileBufSize + 1];
+  snprintf(buffer, kFileBufSize, "%d\n", this->size());
+  *str = buffer;
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
@ -702,11 +715,11 @@ bool UNICHARSET::save_to_file(FILE *file) const {
    get_advance_range(id, &min_advance, &max_advance);
    unsigned int properties = this->get_properties(id);
    if (strcmp(this->id_to_unichar(id), " ") == 0) {
-      fprintf(file, "%s %x %s %d\n", "NULL", properties,
+      snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id));
    } else {
-      fprintf(file,
+      snprintf(buffer, kFileBufSize,
              "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
              this->id_to_unichar(id), properties,
              min_bottom, max_bottom, min_top, max_top, min_width, max_width,
@ -716,10 +729,12 @@ bool UNICHARSET::save_to_file(FILE *file) const {
              this->get_mirror(id), this->get_normed_unichar(id),
              this->debug_str(id).string());
    }
+    *str += buffer;
  }
  return true;
 }

+// TODO(rays) Replace with TFile everywhere.
 class InMemoryFilePointer {
 public:
  InMemoryFilePointer(const char *memory, int mem_size)
@ -776,6 +791,14 @@ bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
  return success;
 }

+bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
+  TessResultCallback2<char *, char *, int> *fgets_cb =
+      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
+  bool success = load_via_fgets(fgets_cb, skip_fragments);  // Via file->FGets.
+  delete fgets_cb;  // Created above, so released here.
+  return success;
+}
+
 bool UNICHARSET::load_via_fgets(
    TessResultCallback2<char *, char *, int> *fgets_cb,
    bool skip_fragments) {
--- a/ccutil/unicharset.h
+++ b/ccutil/unicharset.h
@ -23,6 +23,7 @@
 #include "errcode.h"
 #include "genericvector.h"
 #include "helpers.h"
+#include "serialis.h"
 #include "strngs.h"
 #include "tesscallback.h"
 #include "unichar.h"
@ -317,7 +318,22 @@ class UNICHARSET {

  // Saves the content of the UNICHARSET to the given file.
  // Returns true if the operation is successful.
-  bool save_to_file(FILE *file) const;
+  bool save_to_file(FILE *file) const {
+    STRING str;  // Text is built in memory first, then written in one call.
+    if (!save_to_string(&str)) return false;
+    if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
+    return true;
+  }
+  bool save_to_file(tesseract::TFile *file) const {
+    STRING str;  // Text is built in memory first, then written in one call.
+    if (!save_to_string(&str)) return false;
+    if (file->FWrite(&str[0], str.length(), 1) != 1) return false;
+    return true;
+  }
+
+  // Saves the content of the UNICHARSET to the given STRING.
+  // Returns true if the operation is successful.
+  bool save_to_string(STRING *str) const;

  // Load a unicharset from a unicharset file that has been loaded into
  // the given memory buffer.
@ -348,6 +364,8 @@ class UNICHARSET {
  // Returns true if the operation is successful.
  bool load_from_file(FILE *file, bool skip_fragments);
  bool load_from_file(FILE *file) { return load_from_file(file, false); }
+  bool load_from_file(tesseract::TFile *file, bool skip_fragments);
+

  // Sets up internal data after loading the file, based on the char
  // properties. Called from load_from_file, but also needs to be run
--- a/textord/makerow.cpp
+++ b/textord/makerow.cpp
@ -161,7 +161,8 @@ float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob, TO_ROW_IT* row_it) {
 * only a single blob, it makes 2 rows, in case the top-level blob
 * is a container of the real blobs to recognize.
 */
-float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
+float make_single_row(ICOORD page_tr, bool allow_sub_blobs,
+                      TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
  BLOBNBOX_IT blob_it = &block->blobs;
  TO_ROW_IT row_it = block->get_rows();

@ -169,11 +170,17 @@ float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
  blob_it.add_list_after(&block->small_blobs);
  blob_it.add_list_after(&block->noise_blobs);
  blob_it.add_list_after(&block->large_blobs);
-  if (block->blobs.singleton()) {
+  if (block->blobs.singleton() && allow_sub_blobs) {
    blob_it.move_to_first();
    float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
    if (size > block->line_size)
      block->line_size = size;
+  } else if (block->blobs.empty()) {
+    // Make a fake blob.
+    C_BLOB* blob = C_BLOB::FakeBlob(block->block->bounding_box());
+    // The blobnbox owns the blob.
+    BLOBNBOX* bblob = new BLOBNBOX(blob);
+    blob_it.add_after_then_move(bblob);
  }
  MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
  // Fit an LMS line to the rows.
--- a/textord/makerow.h
+++ b/textord/makerow.h
@ -133,7 +133,7 @@ inline bool within_error_margin(float test, float num, float margin) {
 void fill_heights(TO_ROW *row, float gradient, int min_height,
                  int max_height, STATS *heights, STATS *floating_heights);

-float make_single_row(ICOORD page_tr, TO_BLOCK* block,
+float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK* block,
                      TO_BLOCK_LIST* blocks);
 float make_rows(ICOORD page_tr,              // top right
                TO_BLOCK_LIST *port_blocks);
--- a/textord/textord.cpp
+++ b/textord/textord.cpp
@ -317,8 +317,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    gradient = make_rows(page_tr_, to_blocks);
  } else if (!PSM_SPARSE(pageseg_mode)) {
-    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
-    gradient = make_single_row(page_tr_, to_block, to_blocks);
+    // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
+    gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE,
+                               to_block, to_blocks);
  }
  BaselineDetect baseline_detector(textord_baseline_debug,
                                   reskew, to_blocks);
@ -339,7 +340,8 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
    make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                     to_block->get_rows(), to_block->block->row_list());
  }
-  cleanup_blocks(blocks);  // Remove empties.
+  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
+  // Remove empties.

  // Compute the margins for each row in the block, to be used later for
  // paragraph detection.
--- a/textord/textord.h
+++ b/textord/textord.h
@ -206,7 +206,7 @@ class Textord {
  // Must have at least one WERD.
  // WERDs contain a fake blob.
  void cleanup_nontext_block(BLOCK* block);
-  void cleanup_blocks(BLOCK_LIST *blocks);
+  void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
  BOOL8 clean_noise_from_row(ROW *row);
  void clean_noise_from_words(ROW *row);
  // Remove outlines that are a tiny fraction in either width or height
--- a/textord/tordmain.cpp
+++ b/textord/tordmain.cpp
@ -360,9 +360,11 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
  // Non-text blocks must contain at least one row.
  ROW_IT row_it(block->row_list());
  if (row_it.empty()) {
-    float height = block->bounding_box().height();
-    inT32 zero = 0;
-    ROW* row = new ROW(0, &zero, NULL, height / 2.0f, height / 4.0f,
+    TBOX box = block->bounding_box();
+    float height = box.height();
+    inT32 xstarts[2] = {box.left(), box.right()};
+    double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
+    ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
                       height / 4.0f, 0, 1);
    row_it.add_after_then_move(row);
  }
@ -398,9 +400,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
 * Delete empty blocks, rows from the page.
 **********************************************************************/

-void Textord::cleanup_blocks(                    //remove empties
-                             BLOCK_LIST *blocks  //list
-                            ) {
+void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
  BLOCK_IT block_it = blocks;    //iterator
  ROW_IT row_it;                 //row iterator

@ -417,22 +417,24 @@ void Textord::cleanup_blocks(                    //remove empties
    }
    num_rows = 0;
    num_rows_all = 0;
-    row_it.set_to_list(block->row_list());
-    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
-      ++num_rows_all;
-      clean_small_noise_from_words(row_it.data());
-      if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
-           clean_noise_from_row(row_it.data())) ||
-          row_it.data()->word_list()->empty()) {
-        delete row_it.extract();  // lose empty row.
-      } else {
-        if (textord_noise_rejwords)
-          clean_noise_from_words(row_it.data());
-        if (textord_blshift_maxshift >= 0)
-          tweak_row_baseline(row_it.data(),
-                             textord_blshift_maxshift,
-                             textord_blshift_xfraction);
-        ++num_rows;
+    if (clean_noise) {
+      row_it.set_to_list(block->row_list());
+      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+        ++num_rows_all;
+        clean_small_noise_from_words(row_it.data());
+        if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
+             clean_noise_from_row(row_it.data())) ||
+            row_it.data()->word_list()->empty()) {
+          delete row_it.extract();  // lose empty row.
+        } else {
+          if (textord_noise_rejwords)
+            clean_noise_from_words(row_it.data());
+          if (textord_blshift_maxshift >= 0)
+            tweak_row_baseline(row_it.data(),
+                               textord_blshift_maxshift,
+                               textord_blshift_xfraction);
+          ++num_rows;
+        }
      }
    }
    if (block->row_list()->empty()) {
--- a/wordrec/language_model.cpp
+++ b/wordrec/language_model.cpp
@ -299,7 +299,7 @@ bool LanguageModel::UpdateState(
    //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    UNICHAR_ID unichar_id = choice->unichar_id();
    if (unicharset.get_fragment(unichar_id)) {
-      continue;  // skip fragments
+      continue;  // Skip fragments.
    }
    // Set top choice flags.
    LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
@ -651,6 +651,8 @@ bool LanguageModel::AddViterbiStateEntry(
      ngram_info, (language_model_debug_level > 0) ?
          dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
  new_vse->cost = ComputeAdjustedPathCost(new_vse);
+  if (language_model_debug_level >= 3)
+    tprintf("Adjusted cost = %g\n", new_vse->cost);

  // Invoke Top Choice language model component to make the final adjustments
  // to new_vse->top_choice_flags.
@ -1311,7 +1313,7 @@ void LanguageModel::UpdateBestChoice(
          vse->dawg_info != NULL && vse->top_choice_flags);
    }
  }
-  if (wordrec_display_segmentations) {
+  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
    word->DisplaySegmentation(word_res->chopped_word);
  }
 }
--- a/wordrec/segsearch.cpp
+++ b/wordrec/segsearch.cpp
@ -37,52 +37,16 @@ void Wordrec::DoSegSearch(WERD_RES* word_res) {
 void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
-  if (segsearch_debug_level > 0) {
-    tprintf("Starting SegSearch on ratings matrix%s:\n",
-            wordrec_enable_assoc ? " (with assoc)" : "");
-    word_res->ratings->print(getDict().getUnicharset());
-  }
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);
-
-  pain_points.GenerateInitial(word_res);
-
  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
-
-  language_model_->InitForWord(prev_word_best_choice_,
-                               assume_fixed_pitch_char_segment,
-                               segsearch_max_char_wh_ratio, rating_cert_scale);
-
-  // Initialize blamer-related information: map character boxes recorded in
-  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
-  // ratings matrix. We expect this step to succeed, since when running the
-  // chopper we checked that the correct chops are present.
-  if (blamer_bundle != NULL) {
-    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
-                                            wordrec_debug_blamer);
-  }
-
-  MATRIX_COORD pain_point;
-  float pain_point_priority;
-
-  // pending[col] tells whether there is update work to do to combine
-  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
-  // As the language model state is updated, pending entries are modified to
-  // minimize duplication of work. It is important that during the update the
-  // children are considered in the non-decreasing order of their column, since
-  // this guarantees that all the parents would be up to date before an update
-  // of a child is done.
  GenericVector<SegSearchPending> pending;
-  pending.init_to_size(word_res->ratings->dimension(), SegSearchPending());
-
-  // Search the ratings matrix for the initial best path.
-  pending[0].SetColumnClassified();
-  UpdateSegSearchNodes(rating_cert_scale, 0, &pending, word_res,
-                       &pain_points, best_choice_bundle, blamer_bundle);
+  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
+                   blamer_bundle);

  if (!SegSearchDone(0)) {  // find a better choice
    if (chop_enable && word_res->chopped_word != NULL) {
@ -98,6 +62,9 @@ void Wordrec::SegSearch(WERD_RES* word_res,
    }
  }
  // Keep trying to find a better path by fixing the "pain points".
+
+  MATRIX_COORD pain_point;
+  float pain_point_priority;
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (wordrec_enable_assoc &&
@ -159,6 +126,72 @@ void Wordrec::SegSearch(WERD_RES* word_res,
  }
 }

+// Setup and run just the initial segsearch on an established matrix,
+// without doing any additional chopping or joining.
+// Builds its own pain points, pending vector and best-choice bundle, then
+// delegates to InitialSegSearch with no blamer bundle.
+// word_res must be non-NULL and have a non-NULL ratings matrix, since
+// word_res->ratings is dereferenced unconditionally.
+void Wordrec::WordSearch(WERD_RES* word_res) {
+  LMPainPoints pain_points(segsearch_max_pain_points,
+                           segsearch_max_char_wh_ratio,
+                           assume_fixed_pitch_char_segment,
+                           &getDict(), segsearch_debug_level);
+  GenericVector<SegSearchPending> pending;
+  // The bundle is local to this function and is sized from the ratings
+  // matrix dimension; it is destroyed on return.
+  // NOTE(review): any result the caller needs is presumably recorded in
+  // word_res by the search itself - confirm.
+  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
+  // Run Segmentation Search.
+  InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
+  // Optionally dump the ratings matrix as it stands after the search.
+  if (segsearch_debug_level > 0) {
+    tprintf("Ending ratings matrix%s:\n",
+            wordrec_enable_assoc ? " (with assoc)" : "");
+    word_res->ratings->print(getDict().getUnicharset());
+  }
+}
+
+
+// Setup and run just the initial segsearch on an established matrix,
+// without doing any additional chopping or joining.
+// (Internal factored version that can be used as part of the main SegSearch.)
+//   word_res: word whose ratings matrix is searched; word_res->ratings is
+//     dereferenced unconditionally.
+//   pain_points: receives the initial pain points via GenerateInitial.
+//   pending: output; resized here to the ratings matrix dimension.
+//   best_choice_bundle: passed through to UpdateSegSearchNodes.
+//   blamer_bundle: may be NULL, in which case the blamer setup is skipped.
+void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
+                               GenericVector<SegSearchPending>* pending,
+                               BestChoiceBundle* best_choice_bundle,
+                               BlamerBundle* blamer_bundle) {
+  // Optionally dump the ratings matrix before the search starts.
+  if (segsearch_debug_level > 0) {
+    tprintf("Starting SegSearch on ratings matrix%s:\n",
+            wordrec_enable_assoc ? " (with assoc)" : "");
+    word_res->ratings->print(getDict().getUnicharset());
+  }
+
+  pain_points->GenerateInitial(word_res);
+
+  // Compute scaling factor that will help us recover blob outline length
+  // from classifier rating and certainty for the blob.
+  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
+
+  language_model_->InitForWord(prev_word_best_choice_,
+                               assume_fixed_pitch_char_segment,
+                               segsearch_max_char_wh_ratio, rating_cert_scale);
+
+  // Initialize blamer-related information: map character boxes recorded in
+  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
+  // ratings matrix. We expect this step to succeed, since when running the
+  // chopper we checked that the correct chops are present.
+  if (blamer_bundle != NULL) {
+    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
+                                            wordrec_debug_blamer);
+  }
+
+  // pending[col] tells whether there is update work to do to combine
+  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
+  // As the language model state is updated, pending entries are modified to
+  // minimize duplication of work. It is important that during the update the
+  // children are considered in the non-decreasing order of their column, since
+  // this guarantees that all the parents would be up to date before an update
+  // of a child is done.
+  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
+
+  // Search the ratings matrix for the initial best path.
+  // NOTE(review): indexing element 0 assumes the ratings dimension is at
+  // least 1 - confirm callers never pass an empty matrix.
+  (*pending)[0].SetColumnClassified();
+  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
+                       pain_points, best_choice_bundle, blamer_bundle);
+}
+
 void Wordrec::UpdateSegSearchNodes(
    float rating_cert_scale,
    int starting_col,
--- a/wordrec/wordrec.h
+++ b/wordrec/wordrec.h
@ -266,11 +266,22 @@ class Wordrec : public Classify {
  // to combine blobs. Segmentation search will run only one "iteration"
  // on the classifications already recorded in chunks_record.ratings.
  //
-  // Note: this function assumes that word, output_best_state,
-  // best_char_choices and fixpt arguments are not NULL.
+  // Note: this function assumes that the word_res and best_choice_bundle
+  // arguments are not NULL.
  void SegSearch(WERD_RES* word_res,
                 BestChoiceBundle* best_choice_bundle,
                 BlamerBundle* blamer_bundle);
+  // Setup and run just the initial segsearch on an established matrix,
+  // without doing any additional chopping or joining.
+  void WordSearch(WERD_RES* word_res);
+
+  // Setup and run just the initial segsearch on an established matrix,
+  // without doing any additional chopping or joining.
+  // (Internal factored version that can be used as part of the main SegSearch.)
+  void InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
+                        GenericVector<SegSearchPending>* pending,
+                        BestChoiceBundle* best_choice_bundle,
+                        BlamerBundle* blamer_bundle);

  // Runs SegSearch() function (above) without needing a best_choice_bundle
  // or blamer_bundle. Used for testing.