Improved sub/superscript treatment

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@872 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-11 12:43:17 +08:00 · 2013-09-20 19:49:47 +00:00 · 2013-09-20 19:49:47 +00:00 · 2aafc9df24
commit 2aafc9df24
parent 96c662ed6e
3 changed files with 872 additions and 53 deletions
--- a/ccmain/superscript.cpp
+++ b/ccmain/superscript.cpp
@ -0,0 +1,610 @@
 /******************************************************************
 * File:        superscript.cpp
 * Description: Correction pass to fix superscripts and subscripts.
 * Author:      David Eger
 * Created:     Mon Mar 12 14:05:00 PDT 2012
 *
 * (C) Copyright 2012, Google, Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
 #include "normalis.h"
 #include "tesseractclass.h"
 // Returns how many chopped blobs make up the first num_unichars unichars of
 // word, by summing the corresponding leading entries of word->best_state.
 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
  int chopped_total = 0;
  int unichar_idx = 0;
  while (unichar_idx < num_unichars) {
    chopped_total += word->best_state[unichar_idx];
    ++unichar_idx;
  }
  return chopped_total;
 }
 // Returns how many chopped blobs make up the last num_unichars unichars of
 // word, by summing the corresponding trailing entries of word->best_state.
 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
  int chopped_total = 0;
  // from_end counts positions back from the end: 1 is the last entry.
  for (int from_end = 1; from_end <= num_unichars; ++from_end) {
    chopped_total += word->best_state[word->best_state.size() - from_end];
  }
  return chopped_total;
 }
 namespace tesseract {
 /**
 * Given a recognized blob, see if a contiguous collection of sub-pieces
 * (chopped blobs) starting at its left might qualify as being a subscript
 * or superscript letter based only on y position.  Also do this for the
 * right side.
 *
 * A piece counts as superscript when the bottom of its bounding box is at or
 * above super_y_bottom; otherwise as subscript when the top of its bounding
 * box is at or below sub_y_top; otherwise it is normal.
 *
 * @param[in]  word                the word holding best_state / chopped_word.
 * @param[in]  rebuilt_blob_index  index (into best_state) of the blob to scan.
 * @param[in]  super_y_bottom      minimum box bottom for a superscript piece.
 * @param[in]  sub_y_top           maximum box top for a subscript piece.
 * @param[out] leading_pos            position shared by the run of outlier
 *                                    pieces preceding the first normal piece
 *                                    (SP_NORMAL if there is no such run).
 * @param[out] num_leading_outliers   length of that leading run.  Stays 0 if
 *                                    no normal piece is ever seen.
 * @param[out] trailing_pos           position of the last piece scanned.
 * @param[out] num_trailing_outliers  length of the run of same-position
 *                                    outlier pieces ending at the last piece.
 *
 * Any of the four output pointers may be NULL if the caller is not interested.
 */
 void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index,
                    int super_y_bottom, int sub_y_top,
                    ScriptPos *leading_pos, int *num_leading_outliers,
                    ScriptPos *trailing_pos, int *num_trailing_outliers) {
  // Redirect any NULL outputs to local scratch storage so the code below can
  // write through every pointer unconditionally.
  ScriptPos scratch_leading_pos, scratch_trailing_pos;
  int scratch_leading_count, scratch_trailing_count;
  if (leading_pos == NULL) leading_pos = &scratch_leading_pos;
  if (num_leading_outliers == NULL)
    num_leading_outliers = &scratch_leading_count;
  if (trailing_pos == NULL) trailing_pos = &scratch_trailing_pos;
  if (num_trailing_outliers == NULL)
    num_trailing_outliers = &scratch_trailing_count;

  *num_trailing_outliers = 0;
  *num_leading_outliers = 0;
  *trailing_pos = SP_NORMAL;
  *leading_pos = SP_NORMAL;

  // Locate the chopped pieces that were combined into the requested blob.
  int first_piece = LeadingUnicharsToChopped(word, rebuilt_blob_index);
  int piece_count = word->best_state[rebuilt_blob_index];

  ScriptPos prev_pos = SP_NORMAL;
  int run_length = 0;  // Current run of same-position outlier pieces.
  for (int piece = 0; piece < piece_count; ++piece) {
    TBOX piece_box =
        word->chopped_word->blobs[first_piece + piece]->bounding_box();
    ScriptPos cur_pos;
    if (piece_box.bottom() >= super_y_bottom) {
      cur_pos = SP_SUPERSCRIPT;
    } else if (piece_box.top() <= sub_y_top) {
      cur_pos = SP_SUBSCRIPT;
    } else {
      cur_pos = SP_NORMAL;
    }

    if (cur_pos != SP_NORMAL) {
      // Extend the run if the position is unchanged, otherwise start over.
      run_length = (cur_pos == prev_pos) ? run_length + 1 : 1;
    } else {
      // A run as long as the number of pieces seen so far means every piece
      // up to here was an outlier at one position: that is the leading run.
      if (run_length == piece) {
        *num_leading_outliers = run_length;
        *leading_pos = prev_pos;
      }
      run_length = 0;
    }
    prev_pos = cur_pos;
  }
  *num_trailing_outliers = run_length;
  *trailing_pos = prev_pos;
 }
 /**
 * Attempt to split off any high (or low) bits at the ends of the word with poor
 * certainty and recognize them separately.  If the certainty gets much better
 * and other sanity checks pass, accept.
 *
 * This superscript fix is meant to be called in the second pass of recognition
 * when we have tried once and already have a preliminary answer for word.
 *
 * @param word  The word to examine.  When a believable split is found, its
 *              results are replaced through ConsumeWordResults().
 * @return Whether we modified the given word.
 */
 bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
  // Nothing to do for failed words, repeated-char words, or words that have
  // no best choice to compare against.
  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
      !word->best_choice) {
    return false;
  }
  int num_leading, num_trailing;
  ScriptPos sp_leading, sp_trailing;
  float leading_certainty, trailing_certainty;
  float avg_certainty, unlikely_threshold;
  // Calculate the number of whole suspicious characters at the edges.
  GetSubAndSuperscriptCandidates(
          word, &num_leading, &sp_leading, &leading_certainty,
          &num_trailing, &sp_trailing, &trailing_certainty,
          &avg_certainty, &unlikely_threshold);
  // Labels used only by the debug output below.
  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
  int num_blobs = word->best_choice->length();
  // Calculate the remainder (partial characters) at the edges.
  // This accounts for us having classified the best version of
  // a word as [speaker?'] when it was instead [speaker.^{21}]
  // (that is we accidentally thought the 2 was attached to the period).
  int num_remainder_leading = 0, num_remainder_trailing = 0;
  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
    int super_y_bottom =
        kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
    int sub_y_top =
        kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
    // Examine the last character that is not already a whole trailing outlier.
    int last_word_char = num_blobs - 1 - num_trailing;
    float last_char_certainty = word->best_choice->certainty(last_word_char);
    if (word->best_choice->unichar_id(last_word_char) != 0 &&
        last_char_certainty <= unlikely_threshold) {
      ScriptPos rpos;
      YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
                     NULL, NULL, &rpos, &num_remainder_trailing);
      // Partial pieces must agree in position with any whole trailing blobs.
      if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
      if (num_remainder_trailing > 0 &&
          last_char_certainty < trailing_certainty) {
        trailing_certainty = last_char_certainty;
      }
    }
    // Only look at the first remaining character if it is not the very same
    // character we just took trailing pieces from.
    bool another_blob_available = (num_remainder_trailing == 0) ||
        num_leading + num_trailing + 1 < num_blobs;
    // Certainties are floats; keep full precision here (an int would truncate
    // the value toward zero and skew the comparisons and the stored minimum).
    float first_char_certainty = word->best_choice->certainty(num_leading);
    if (another_blob_available &&
        word->best_choice->unichar_id(num_leading) != 0 &&
        first_char_certainty <= unlikely_threshold) {
      ScriptPos lpos;
      YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
                     &lpos, &num_remainder_leading, NULL, NULL);
      // Partial pieces must agree in position with any whole leading blobs.
      if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
      if (num_remainder_leading > 0 &&
          first_char_certainty < leading_certainty) {
        leading_certainty = first_char_certainty;
      }
    }
  }
  // If nothing to do, bail now.
  if (num_leading + num_trailing +
      num_remainder_leading + num_remainder_trailing == 0) {
    return false;
  }
  if (superscript_debug >= 1) {
    tprintf("Candidate for superscript detection: %s (",
            word->best_choice->unichar_string().string());
    if (num_leading || num_remainder_leading) {
      tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
              leading_pos);
    }
    if (num_trailing || num_remainder_trailing) {
      tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
              trailing_pos);
    }
    tprintf(")\n");
  }
  if (superscript_debug >= 3) {
    word->best_choice->print();
  }
  if (superscript_debug >= 2) {
    tprintf(" Certainties -- Average: %.2f  Unlikely thresh: %.2f  ",
            avg_certainty, unlikely_threshold);
    if (num_leading)
      tprintf("Orig. leading (min): %.2f  ", leading_certainty);
    if (num_trailing)
      tprintf("Orig. trailing (min): %.2f  ", trailing_certainty);
    tprintf("\n");
  }
  // We've now calculated the number of rebuilt blobs we want to carve off.
  // However, split_word() works from TBLOBs in chopped_word, so we need to
  // convert to those.
  int num_chopped_leading =
      LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
  int num_chopped_trailing =
      TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
  int retry_leading = 0;
  int retry_trailing = 0;
  bool is_good = false;
  WERD_RES *revised = TrySuperscriptSplits(
      num_chopped_leading, leading_certainty, sp_leading,
      num_chopped_trailing, trailing_certainty, sp_trailing,
      word, &is_good, &retry_leading, &retry_trailing);
  if (is_good) {
    word->ConsumeWordResults(revised);
  } else if (retry_leading || retry_trailing) {
    // The first attempt was only partly believable: retry once on the revised
    // word using the (rebuilt) counts it suggested, converted to chopped blobs.
    int retry_chopped_leading =
        LeadingUnicharsToChopped(revised, retry_leading);
    int retry_chopped_trailing =
        TrailingUnicharsToChopped(revised, retry_trailing);
    WERD_RES *revised2 = TrySuperscriptSplits(
        retry_chopped_leading, leading_certainty, sp_leading,
        retry_chopped_trailing, trailing_certainty, sp_trailing,
        revised, &is_good, &retry_leading, &retry_trailing);
    if (is_good) {
      word->ConsumeWordResults(revised2);
    }
    delete revised2;
  }
  // revised may be NULL here (TrySuperscriptSplits gave up); delete is a no-op.
  delete revised;
  return is_good;
 }
 /**
 * Determine how many characters (rebuilt blobs) on each end of a given word
 * might plausibly be superscripts so SubAndSuperscriptFix can try to
 * re-recognize them.  Even if we find no whole blobs at either end,
 * we will set *unlikely_threshold to a certainty that might be used to
 * select "bad enough" outlier characters.  If *unlikely_threshold is set to 0,
 * though, there's really no hope.
 *
 * @param[in]  word    The word to examine.
 * @param[out] num_rebuilt_leading   the number of rebuilt blobs at the start
 *                                   of the word which are all up or down and
 *                                   seem badly classified.
 * @param[out] leading_pos        "super" or "sub" (for debugging)
 * @param[out] leading_certainty  the worst certainty in the leading blobs.
 * @param[out] num_rebuilt_trailing   the number of rebuilt blobs at the end
 *                                    of the word which are all up or down and
 *                                    seem badly classified.
 * @param[out] trailing_pos        "super" or "sub" (for debugging)
 * @param[out] trailing_certainty  the worst certainty in the trailing blobs.
 * @param[out] avg_certainty       the average certainty of "normal" blobs in
 *                                 the word.
 * @param[out] unlikely_threshold  the threshold (on certainty) we used to
 *                                 select "bad enough" outlier characters.
 */
 void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word,
                                               int *num_rebuilt_leading,
                                               ScriptPos *leading_pos,
                                               float *leading_certainty,
                                               int *num_rebuilt_trailing,
                                               ScriptPos *trailing_pos,
                                               float *trailing_certainty,
                                               float *avg_certainty,
                                               float *unlikely_threshold) {
  *avg_certainty = *unlikely_threshold = 0.0f;
  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
  *leading_certainty = *trailing_certainty = 0.0f;
  int super_y_bottom =
      kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
  int sub_y_top =
      kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
  // Step one: Get an average certainty for "normally placed" characters.
  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
  *leading_pos = *trailing_pos = SP_NORMAL;
  int leading_outliers = 0;
  int trailing_outliers = 0;
  int num_normal = 0;
  float normal_certainty_total = 0.0f;
  float worst_normal_certainty = 0.0f;
  ScriptPos last_pos = SP_NORMAL;
  int num_blobs = word->rebuild_word->NumBlobs();
  for (int b = 0; b < num_blobs; ++b) {
    TBOX box = word->rebuild_word->blobs[b]->bounding_box();
    ScriptPos pos = SP_NORMAL;
    if (box.bottom() >= super_y_bottom) {
      pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      pos = SP_SUBSCRIPT;
    }
    if (pos == SP_NORMAL) {
      if (word->best_choice->unichar_id(b) != 0) {
        float char_certainty = word->best_choice->certainty(b);
        if (char_certainty < worst_normal_certainty) {
          worst_normal_certainty = char_certainty;
        }
        num_normal++;
        normal_certainty_total += char_certainty;
      }
      if (trailing_outliers == b) {
        leading_outliers = trailing_outliers;
        *leading_pos = last_pos;
      }
      trailing_outliers = 0;
    } else {
      if (last_pos == pos) {
        trailing_outliers++;
      } else {
        trailing_outliers = 1;
      }
    }
    last_pos = pos;
  }
  *trailing_pos = last_pos;
  if (num_normal >= 3) {  // throw out the worst as an outlier.
    num_normal--;
    normal_certainty_total -= worst_normal_certainty;
  }
  if (num_normal > 0) {
    *avg_certainty = normal_certainty_total / num_normal;
    *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
  }
  if (num_normal == 0 ||
      (leading_outliers == 0 && trailing_outliers == 0)) {
    return;
  }
  // Step two: Try to split off bits of the word that are both outliers
  //           and have much lower certainty than average
  // Calculate num_leading and leading_certainty.
  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
       *num_rebuilt_leading < leading_outliers;
       (*num_rebuilt_leading)++) {
    float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    if (char_certainty < *leading_certainty) {
      *leading_certainty = char_certainty;
    }
  }
  // Calculate num_trailing and trailing_certainty.
  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
       *num_rebuilt_trailing < trailing_outliers;
       (*num_rebuilt_trailing)++) {
    int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
    float char_certainty = word->best_choice->certainty(blob_idx);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    if (char_certainty < *trailing_certainty) {
      *trailing_certainty = char_certainty;
    }
  }
 }
 /**
 * Try splitting off the given number of (chopped) blobs from the front and
 * back of the given word and recognizing the pieces.
 *
 * @param[in]  num_chopped_leading   how many chopped blobs from the left
 *                    end of the word to chop off and try recognizing as a
 *                    superscript (or subscript)
 * @param[in]  leading_certainty     the (minimum) certainty had by the
 *                    characters in the original leading section.
 * @param[in]  leading_pos    "super" or "sub" (for debugging)
 * @param[in]  num_chopped_trailing  how many chopped blobs from the right
 *                    end of the word to chop off and try recognizing as a
 *                    superscript (or subscript)
 * @param[in]  trailing_certainty    the (minimum) certainty had by the
 *                    characters in the original trailing section.
 * @param[in]  trailing_pos      "super" or "sub" (for debugging)
 * @param[in]  word              the word to try to chop up.
 * @param[out] is_good           do we believe our result?
 * @param[out] retry_rebuild_leading, retry_rebuild_trailing
 *         If non-zero, and !is_good, then the caller may have luck trying
 *         to split the returned word with this number of (rebuilt) leading
 *         and trailing blobs / unichars.
 * @return A word which is the result of re-recognizing as asked.
 */
 WERD_RES *Tesseract::TrySuperscriptSplits(
    int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
    int num_chopped_trailing, float trailing_certainty,
    ScriptPos trailing_pos,
    WERD_RES *word,
    bool *is_good,
    int *retry_rebuild_leading, int *retry_rebuild_trailing) {
  int num_chopped = word->chopped_word->NumBlobs();
  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
  // Chop apart the word into up to three pieces.
  // bb0 / bb1 receive whatever split_word() hands back through its last
  // argument; they are passed on to join_words() when re-assembling below.
  BlamerBundle *bb0 = NULL;
  BlamerBundle *bb1 = NULL;
  WERD_RES *prefix = NULL;
  WERD_RES *core = NULL;
  WERD_RES *suffix = NULL;
  if (num_chopped_leading > 0) {
    // Work on a copy of word: prefix keeps the first num_chopped_leading
    // chopped blobs and core receives the rest.
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }
  if (num_chopped_trailing > 0) {
    // core no longer contains the leading blobs, so the split point is
    // measured from the start of core rather than from the original word.
    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
    split_word(core, split_pt, &suffix, &bb1);
  }
  //  Recognize the pieces in turn.
  // Remember the current multiplier settings so they can be restored after
  // each piece is recognized with them zeroed.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);
    // Adjust our expectations about the baseline for this prefix.
    // NOTE(review): no baseline adjustment is actually performed in this
    // block; only the two multipliers above are changed.
    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n",
              ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().string());
    }
    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }
  // Note: this message is printed here, but core itself is only recognized
  // further down, after the prefix/suffix results have been evaluated.
  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n",
            num_chopped - num_chopped_leading - num_chopped_trailing);
  }
  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);
    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n",
              ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().string());
    }
    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }
  // Evaluate whether we think the results are believably better
  // than what we already had.
  // Each piece must reach superscript_bettered_certainty times the original
  // worst certainty on its side.  A missing piece counts as good.
  // For the prefix we ask how many left-side characters were ok; for the
  // suffix, how many right-side characters were ok.
  bool good_prefix = !prefix || BelievableSuperscript(
      superscript_debug >= 1, *prefix,
      superscript_bettered_certainty * leading_certainty,
      retry_rebuild_leading, NULL);
  bool good_suffix = !suffix || BelievableSuperscript(
      superscript_debug >= 1, *suffix,
      superscript_bettered_certainty * trailing_certainty,
      NULL, retry_rebuild_trailing);
  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    // NOTE(review): bb0 / bb1 are not released on this path; confirm whether
    // split_word() leaves ownership of them with the caller.
    delete core;
    delete prefix;
    delete suffix;
    return NULL;
  }
  // Something is worth keeping (or retrying), so recognize the middle too.
  recog_word_recursive(core);
  // Now paste the results together into core.
  // The suffix is joined first so that core is complete on its right side
  // before being appended to the prefix.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    // The joined result now lives in prefix; return it under the name core.
    core = prefix;
    prefix = NULL;
  }
  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().string());
  }
  return core;
 }
 /**
 * Return whether this is believable superscript or subscript text.
 *
 * We insist that:
 *   + there are no punctuation marks.
 *   + there are no italics.
 *   + no normal-sized character is smaller than superscript_scaledown_ratio
 *     of what it ought to be, and
 *   + each character is at least as certain as certainty_threshold.
 *
 *  @param[in]  debug  If true, spew debug output
 *  @param[in]  word   The word whose best_choice we're evaluating
 *  @param[in]  certainty_threshold   If any of the characters have less
 *                    certainty than this, reject.
 *  @param[out]  left_ok  How many left-side characters were ok?
 *  @param[out]  right_ok  How many right-side characters were ok?
 *  @return  Whether the complete best choice is believable as a superscript.
 */
 bool Tesseract::BelievableSuperscript(bool debug,
                                      const WERD_RES &word,
                                      float certainty_threshold,
                                      int *left_ok,
                                      int *right_ok) const {
  const WERD_CHOICE &wc = *word.best_choice;
  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();

  int leading_ok_run = 0;   // Acceptable characters before the first bad one.
  int current_ok_run = 0;   // Acceptable characters since the last bad one.
  float worst_certainty = 0.0f;  // Reported in the debug "Accept" message.

  for (int i = 0; i < wc.length(); i++) {
    UNICHAR_ID unichar_id = wc.unichar_id(i);
    TBLOB *blob = word.rebuild_word->blobs[i];

    // Certainty and punctuation checks.
    float char_certainty = wc.certainty(i);
    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
    bool bad_certainty = char_certainty < certainty_threshold;

    // Italic check: start from the word-level font, then prefer the fonts
    // recorded on this character's own blob choice when they are available.
    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
    BLOB_CHOICE *choice = word.GetBlobChoice(i);
    if (choice && fontinfo_table.size() > 0) {
      int primary_font = choice->fontinfo_id();
      int secondary_font = choice->fontinfo_id2();
      // Italic only if the first font is italic and the second font, when
      // present, is italic as well.
      is_italic = false;
      if (primary_font >= 0) {
        is_italic = fontinfo_table.get(primary_font).is_italic();
      }
      if (is_italic && secondary_font >= 0) {
        is_italic = fontinfo_table.get(secondary_font).is_italic();
      }
    }

    // Size check: compare the blob height with the height expected for this
    // unichar, when the unicharset has usable top/bottom information.
    float char_height = blob->bounding_box().height();
    float normal_height = char_height;
    float height_fraction = 1.0f;
    if (wc.unicharset()->top_bottom_useful()) {
      int min_bot, max_bot, min_top, max_top;
      wc.unicharset()->get_top_bottom(unichar_id,
                                      &min_bot, &max_bot,
                                      &min_top, &max_top);
      float lo_height = min_top - min_bot;
      float hi_height = max_top - max_bot;
      normal_height = (hi_height + lo_height) / 2;
      // Only ding characters that we have decent information for because
      // they're supposed to be normal sized, not tiny specks or dashes.
      if (normal_height >= kBlnXHeight) {
        height_fraction = char_height / normal_height;
      }
    }
    bool bad_height = height_fraction < superscript_scaledown_ratio;

    if (debug) {
      if (is_italic) {
        tprintf(" Rejecting: superscript is italic.\n");
      }
      if (is_punc) {
        tprintf(" Rejecting: punctuation present.\n");
      }
      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
      if (bad_certainty) {
        tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                "which is less than threshold %.2f\n", char_str,
                char_certainty, certainty_threshold);
      }
      if (bad_height) {
        tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                "expected %.2f\n", char_str, char_height, normal_height);
      }
    }

    bool acceptable = !(bad_certainty || bad_height || is_punc || is_italic);
    if (acceptable) {
      current_ok_run++;
    } else {
      // The first rejection fixes the length of the leading acceptable run.
      if (current_ok_run == i) {
        leading_ok_run = current_ok_run;
      }
      current_ok_run = 0;
    }

    if (char_certainty < worst_certainty) {
      worst_certainty = char_certainty;
    }
  }

  // Every character passed if the final run covers the whole choice.
  if (current_ok_run == wc.length()) {
    if (debug) {
      tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
    }
    return true;
  }

  // Not believable: report how many characters at each end looked fine.
  if (left_ok) *left_ok = leading_ok_run;
  if (right_ok) *right_ok = current_ok_run;
  return false;
 }
 }  // namespace tesseract
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@ -77,9 +77,6 @@ Tesseract::Tesseract()
              "documents while performing ocr.", this->params()),
    STRING_MEMBER(tessedit_write_params_to_file, "",
                  "Write all parameters to the given file.", this->params()),
    BOOL_MEMBER(tessedit_adapt_to_char_fragments, true,
                "Adapt to words that contain "
                " a character composed form fragments", this->params()),
    BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
                " information for adaption", this->params()),
    INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
@ -103,6 +100,8 @@ Tesseract::Tesseract()
                "Call Tess to learn blobs", this->params()),
    BOOL_MEMBER(tessedit_dump_choices, false,
                "Dump char choices", this->params()),
    BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
                this->params()),
    BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
                "Try to improve fuzzy spaces", this->params()),
    BOOL_MEMBER(tessedit_unrej_any_wd, false,
@ -117,7 +116,7 @@ Tesseract::Tesseract()
                "Output font info per char", this->params()),
    BOOL_MEMBER(tessedit_debug_block_rejection, false,
                "Block and Row stats", this->params()),
-    BOOL_MEMBER(tessedit_enable_bigram_correction, false,
+    BOOL_MEMBER(tessedit_enable_bigram_correction, true,
                "Enable correction based on the word bigram dictionary.",
                this->params()),
    INT_MEMBER(tessedit_bigram_debug, 0,
@ -142,8 +141,6 @@ Tesseract::Tesseract()
                  "good_quality_doc gte good char limit", this->params()),
    INT_MEMBER(quality_min_initial_alphas_reqd, 2,
               "alphas in a good word", this->params()),
    BOOL_MEMBER(tessedit_tess_adapt_to_rejmap, false,
                "Use reject map to control Tesseract adaption", this->params()),
    INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
               "Adaptation decision algorithm for tess", this->params()),
    BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
@ -154,14 +151,14 @@ Tesseract::Tesseract()
                "Log matcher activity", this->params()),
    INT_MEMBER(tessedit_test_adaption_mode, 3,
               "Adaptation decision algorithm for tess", this->params()),
    BOOL_MEMBER(save_blob_choices, false,
                "Save the results of the recognition step (blob_choices)"
                " within the corresponding WERD_CHOICE", this->params()),
    BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
    double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
    double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
    INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
               this->params()),
    BOOL_MEMBER(paragraph_text_based, true,
                "Run paragraph detection on the post-text-recognition "
                "(more accurate)", this->params()),
    INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
    STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
                  this->params()),
@ -282,6 +279,30 @@ Tesseract::Tesseract()
               this->params()),
    INT_MEMBER(x_ht_min_change, 8,
               "Min change in xht before actually trying it", this->params()),
    INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer",
               this->params()),
    double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse "
                  "certainty does a superscript position glyph need to be for "
                  "us to try classifying it as a char with a different "
                  "baseline?", this->params()),
    double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in "
                  "badness do we think sufficient to choose a superscript "
                  "over what we'd thought.  For example, a value of 0.6 means "
                  "we want to reduce badness of certainty by at least 40%",
                  this->params()),
    double_MEMBER(superscript_scaledown_ratio, 0.4,
                  "A superscript scaled down more than this is unbelievably "
                  "small.  For example, 0.3 means we expect the font size to "
                  "be no smaller than 30% of the text line font size.",
                  this->params()),
    double_MEMBER(subscript_max_y_top, 0.5,
                  "Maximum top of a character measured as a multiple of "
                  "x-height above the baseline for us to reconsider whether "
                  "it's a subscript.", this->params()),
    double_MEMBER(superscript_min_y_bottom, 0.3,
                  "Minimum bottom of a character measured as a multiple of "
                  "x-height above the baseline for us to reconsider whether "
                  "it's a superscript.", this->params()),
    BOOL_MEMBER(tessedit_write_block_separators, false,
                "Write block separators in output", this->params()),
    BOOL_MEMBER(tessedit_write_rep_codes, false,
@ -314,8 +335,6 @@ Tesseract::Tesseract()
    BOOL_MEMBER(tessedit_consistent_reps, true,
                "Force all rep chars the same", this->params()),
    INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
    INT_MEMBER(tessedit_ok_mode, 5,
               "Acceptance decision algorithm", this->params()),
    BOOL_MEMBER(tessedit_rejection_debug, false,
                "Adaption debug", this->params()),
    BOOL_MEMBER(tessedit_flip_0O, true,
@ -365,22 +384,111 @@ Tesseract::Tesseract()
               " TessdataManager functions.", this->params()),
    STRING_MEMBER(tessedit_load_sublangs, "",
                  "List of languages to load with this one", this->params()),
    BOOL_MEMBER(tessedit_use_primary_params_model, false,
                "In multilingual mode use params model of the"
                " primary language", this->params()),
    double_MEMBER(min_orientation_margin, 7.0,
                  "Min acceptable orientation margin", this->params()),
    BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
                this->params()),
    BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
                this->params()),
    BOOL_MEMBER(poly_allow_detailed_fx, false,
                "Allow feature extractors to see the original outline",
                this->params()),
    BOOL_INIT_MEMBER(tessedit_init_config_only, false,
                     "Only initialize with the config file. Useful if the "
                     "instance is not going to be used for OCR but say only "
                     "for layout analysis.", this->params()),
    BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
                this->params()),
    // The following parameters were deprecated and removed from their original
    // locations. The parameters are temporarily kept here to give Tesseract
    // users a chance to update their [lang].traineddata and config files
    // without introducing failures during Tesseract initialization.
    // TODO(ocr-team): remove these parameters from the code once we are
    // reasonably sure that Tesseract users have updated their data files.
    //
    // BEGIN DEPRECATED PARAMETERS
    INT_MEMBER(tessedit_ok_mode, 5,
               "Acceptance decision algorithm", this->params()),
    BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
                     " (e.g. for non-space delimited languages)",
                     this->params()),
    INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
               this->params()),
    BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
                this->params()),
    double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of"
                  " current best rate to prune other hypotheses",
                  this->params()),
    BOOL_MEMBER(permute_script_word, 0,
                "Turn on word script consistency permuter",
                this->params()),
    BOOL_MEMBER(segment_segcost_rating, 0,
                "incorporate segmentation cost in word rating?",
                this->params()),
    double_MEMBER(segment_reward_script, 0.95,
                  "Score multipler for script consistency within a word. "
                  "Being a 'reward' factor, it should be <= 1. "
                  "Smaller value implies bigger reward.",
                  this->params()),
    BOOL_MEMBER(permute_fixed_length_dawg, 0,
                "Turn on fixed-length phrasebook search permuter",
                this->params()),
    BOOL_MEMBER(permute_chartype_word, 0,
                "Turn on character type (property) consistency permuter",
                this->params()),
    double_MEMBER(segment_reward_chartype, 0.97,
                  "Score multipler for char type consistency within a word. ",
                  this->params()),
    double_MEMBER(segment_reward_ngram_best_choice, 0.99,
                  "Score multipler for ngram permuter's best choice"
                  " (only used in the Han script path).",
                  this->params()),
    BOOL_MEMBER(ngram_permuter_activated, false,
                "Activate character-level n-gram-based permuter",
                this->params()),
    BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
                this->params()),
    INT_MEMBER(language_model_fixed_length_choices_depth, 3,
               "Depth of blob choice lists to explore"
               " when fixed length dawgs are on",
               this->params()),
    BOOL_MEMBER(use_new_state_cost, FALSE,
                "use new state cost heuristics for segmentation state"
                " evaluation", this->params()),
    // Deprecated parameter, kept so that old config files still load.
    // The description is assembled from adjacent string literals, so each
    // piece has to carry its own separating space.
    double_MEMBER(heuristic_segcost_rating_base, 1.25,
                  "base factor for adding segmentation cost into word rating. "
                  "It's a multiplying factor, the larger the value above 1, "
                  "the bigger the effect of segmentation cost.",
                  this->params()),
    // Deprecated parameter, kept so that old config files still load.
    // The continuation literal starts with a space so the concatenated
    // description reads "... cost of state" rather than "... cost ofstate".
    double_MEMBER(heuristic_weight_rating, 1.0,
                  "weight associated with char rating in combined cost of"
                  " state", this->params()),
    double_MEMBER(heuristic_weight_width, 1000.0,
                  "weight associated with width evidence in combined cost of"
                  " state", this->params()),
    double_MEMBER(heuristic_weight_seamcut, 0.0,
                  "weight associated with seam cut in combined cost of state",
                  this->params()),
    double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
                  "max char width-to-height ratio allowed in segmentation",
                  this->params()),
    BOOL_MEMBER(enable_new_segsearch, true,
                "Enable new segmentation search path.", this->params()),
    double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
                  "Maximum character width-to-height ratio for"
                  " fixed-pitch fonts",
                  this->params()),
    // END DEPRECATED PARAMETERS
    backup_config_file_(NULL),
    pix_binary_(NULL),
    cube_binary_(NULL),
    pix_grey_(NULL),
    pix_thresholds_(NULL),
    source_resolution_(0),
    textord_(this),
    right_to_left_(false),
@ -414,6 +522,7 @@ void Tesseract::Clear() {
  pixDestroy(&pix_binary_);
  pixDestroy(&cube_binary_);
  pixDestroy(&pix_grey_);
  pixDestroy(&pix_thresholds_);
  pixDestroy(&scaled_color_);
  deskew_ = FCOORD(1.0f, 0.0f);
  reskew_ = FCOORD(1.0f, 0.0f);
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@ -181,6 +181,10 @@ class Tesseract : public Wordrec {
  // Picks the image to hand out: the grey-level image when one is held,
  // falling back to the binary image when pix_grey_ is NULL.
  Pix* BestPix() const {
    if (pix_grey_ == NULL) {
      return pix_binary_;
    }
    return pix_grey_;
  }
  // Replaces the stored thresholds image: the Pix currently referenced by
  // pix_thresholds_ is passed to pixDestroy() first, and only then is the
  // member pointed at the supplied Pix.
  // NOTE(review): the pointer is stored as-is (no clone/copy is made here), so
  // the caller presumably hands over its reference — confirm with callers.
  void set_pix_thresholds(Pix* thresholds) {
    pixDestroy(&pix_thresholds_);
    pix_thresholds_ = thresholds;
  }
  // Returns the value recorded in source_resolution_ (the independently kept
  // record of the input image resolution).
  int source_resolution() const {
    return source_resolution_;
  }
@ -277,10 +281,7 @@ class Tesseract : public Wordrec {
  ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
                                              const char *s,
                                              const char *lengths);
-  void match_word_pass2(                 //recog one word
+  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
                        WERD_RES *word,  //word to do
                        ROW *row,
                        BLOCK* block);
  void classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word);
  void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
                          WERD_RES* word, WERD_RES* new_word);
@ -288,12 +289,38 @@ class Tesseract : public Wordrec {
  bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
  BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res);
-  void set_word_fonts(
+  // Set fonts of this word.
-      WERD_RES *word,  // set fonts of this word
+  void set_word_fonts(WERD_RES *word);
      BLOB_CHOICE_LIST_CLIST *blob_choices);  // detailed results
  void font_recognition_pass(PAGE_RES* page_res);
  BOOL8 check_debug_pt(WERD_RES *word, int location);
  //// superscript.cpp ////////////////////////////////////////////////////
  // Sub/superscript fixer for a single word result; the definition lives in
  // superscript.cpp.
  // NOTE(review): the bool result presumably reports whether the word was
  // altered — confirm against the definition.
  bool SubAndSuperscriptFix(WERD_RES *word_res);
  // Defined in superscript.cpp. `word` is taken as const, so it is only read.
  // NOTE(review): judging by their names, the remaining pointer arguments are
  // output slots describing the leading and trailing candidate runs (count,
  // script position, certainty) plus two word-level figures (avg_certainty,
  // unlikely_threshold) — confirm against the definition.
  void GetSubAndSuperscriptCandidates(const WERD_RES *word,
                                      int *num_rebuilt_leading,
                                      ScriptPos *leading_pos,
                                      float *leading_certainty,
                                      int *num_rebuilt_trailing,
                                      ScriptPos *trailing_pos,
                                      float *trailing_certainty,
                                      float *avg_certainty,
                                      float *unlikely_threshold);
  // Defined in superscript.cpp. Takes the leading and trailing candidate
  // descriptions by value (chopped-blob count, certainty, script position)
  // together with the word to work on, and returns a WERD_RES pointer.
  // NOTE(review): is_good, retry_leading and retry_trailing look like outputs,
  // and ownership of the returned WERD_RES is not visible from this
  // declaration — confirm both against the definition.
  WERD_RES *TrySuperscriptSplits(int num_chopped_leading,
                                 float leading_certainty,
                                 ScriptPos leading_pos,
                                 int num_chopped_trailing,
                                 float trailing_certainty,
                                 ScriptPos trailing_pos,
                                 WERD_RES *word,
                                 bool *is_good,
                                 int *retry_leading,
                                 int *retry_trailing);
  // Defined in superscript.cpp. Const member: it does not modify this
  // Tesseract instance, and `word` is passed by const reference.
  // NOTE(review): left_ok and right_ok are presumably outputs counting how
  // many characters at each end pass the check against certainty_threshold —
  // confirm against the definition.
  bool BelievableSuperscript(bool debug,
                             const WERD_RES &word,
                             float certainty_threshold,
                             int *left_ok,
                             int *right_ok) const;
  //// cube_control.cpp ///////////////////////////////////////////////////
  bool init_cube_objects(bool load_combiner,
                         TessdataManager *tessdata_manager);
@ -314,7 +341,6 @@ class Tesseract : public Wordrec {
  // Sets up a fake result  and returns false if something goes wrong.
  bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word);
  void fill_werd_res(const BoxWord& cube_box_word,
                     WERD_CHOICE* cube_werd_choice,
                     const char* cube_best_str,
                     WERD_RES* tess_werd_res);
  bool extract_cube_state(CubeObject* cube_obj, int* num_chars,
@ -429,13 +455,10 @@ class Tesseract : public Wordrec {
  // #ifndef GRAPHICS_DISABLED
  BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res);
  // #endif  // GRAPHICS_DISABLED
  void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
  //// reject.h //////////////////////////////////////////////////////////
-  void make_reject_map(            //make rej map for wd //detailed results
+  // make rej map for word
-                       WERD_RES *word,
+  void make_reject_map(WERD_RES *word, ROW *row, inT16 pass);
                       BLOB_CHOICE_LIST_CLIST *blob_choices,
                       ROW *row,
                       inT16 pass  //1st or 2nd?
                      );
  BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
  inT16 first_alphanum_index(const char *word,
                             const char *word_lengths);
@ -456,8 +479,6 @@ class Tesseract : public Wordrec {
                     WERD_RES *word,
                     ROW *row);
  void nn_recover_rejects(WERD_RES *word, ROW *row);
  BOOL8 test_ambig_word(  //test for ambiguity
                        WERD_RES *word);
  void set_done(  //set done flag
                WERD_RES *word,
                inT16 pass);
@ -472,12 +493,16 @@ class Tesseract : public Wordrec {
                       uinT16 mode);
  //// tfacepp.cpp ///////////////////////////////////////////////////////
-  void recog_word_recursive(WERD_RES* word,
+  void recog_word_recursive(WERD_RES* word);
-                            BLOB_CHOICE_LIST_CLIST *blob_choices);
+  void recog_word(WERD_RES *word);
-  void recog_word(WERD_RES *word,
+  void split_and_recog_word(WERD_RES* word);
-                  BLOB_CHOICE_LIST_CLIST *blob_choices);
+  void split_word(WERD_RES *word,
-  void split_and_recog_word(WERD_RES* word,
+                  int split_pt,
-                            BLOB_CHOICE_LIST_CLIST *blob_choices);
+                  WERD_RES **right_piece,
                  BlamerBundle **orig_blamer_bundle) const;
  void join_words(WERD_RES *word,
                  WERD_RES *word2,
                  BlamerBundle *orig_bb) const;
  //// fixspace.cpp ///////////////////////////////////////////////////////
  BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
  inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
@ -495,7 +520,6 @@ class Tesseract : public Wordrec {
                        PAGE_RES *page_res);
  void dump_words(WERD_RES_LIST &perm, inT16 score,
                  inT16 mode, BOOL8 improved);
  BOOL8 uniformly_spaced(WERD_RES *word);
  BOOL8 fixspace_thinks_word_done(WERD_RES *word);
  inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
  float blob_noise_score(TBLOB *blob);
@ -538,14 +562,9 @@ class Tesseract : public Wordrec {
  void tess_add_doc_word(                          //test acceptability
                         WERD_CHOICE *word_choice  //after context
                        );
-  void tess_segment_pass1(WERD_RES *word,
+  void tess_segment_pass_n(int pass_n, WERD_RES *word);
-                          BLOB_CHOICE_LIST_CLIST *blob_choices);
+  bool tess_acceptable_word(WERD_RES *word);
-  void tess_segment_pass2(WERD_RES *word,
+
                          BLOB_CHOICE_LIST_CLIST *blob_choices);
  BOOL8 tess_acceptable_word(                           //test acceptability
                             WERD_CHOICE *word_choice,  //after context
                             WERD_CHOICE *raw_choice    //before context
                            );
  //// applybox.cpp //////////////////////////////////////////////////////
  // Applies the box file based on the image name fname, and resegments
  // the words in the block_list (page), with:
@ -649,7 +668,7 @@ class Tesseract : public Wordrec {
  // estimate.
  float ComputeCompatibleXheight(WERD_RES *word_res);
  //// Data members ///////////////////////////////////////////////////////
-  // TODO(ocr-team): Remove obsolete parameters.
+  // TODO(ocr-team): Find and remove obsolete parameters.
  BOOL_VAR_H(tessedit_resegment_from_boxes, false,
             "Take segmentation and labeling from box file");
  BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
@ -684,9 +703,6 @@ class Tesseract : public Wordrec {
            "documents while performing ocr.");
  STRING_VAR_H(tessedit_write_params_to_file, "",
               "Write all parameters to the given file.");
  BOOL_VAR_H(tessedit_adapt_to_char_fragments, true,
             "Adapt to words that contain "
             " a character composed form fragments");
  BOOL_VAR_H(tessedit_adaption_debug, false,
             "Generate and print debug information for adaption");
  INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
@ -706,6 +722,7 @@ class Tesseract : public Wordrec {
  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
  BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
  BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
  BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
             "Try to improve fuzzy spaces");
  BOOL_VAR_H(tessedit_unrej_any_wd, false,
@ -716,7 +733,7 @@ class Tesseract : public Wordrec {
             "Add words to the document dictionary");
  BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
  BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
-  BOOL_VAR_H(tessedit_enable_bigram_correction, false,
+  BOOL_VAR_H(tessedit_enable_bigram_correction, true,
             "Enable correction based on the word bigram dictionary.");
  INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
            "correction.");
@ -731,8 +748,6 @@ class Tesseract : public Wordrec {
               "good_quality_doc lte outline error limit");
  double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
  INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
  BOOL_VAR_H(tessedit_tess_adapt_to_rejmap, false,
             "Use reject map to control Tesseract adaption");
  INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
            "Adaptation decision algorithm for tess");
  BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
@ -741,13 +756,13 @@ class Tesseract : public Wordrec {
  BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
  INT_VAR_H(tessedit_test_adaption_mode, 3,
            "Adaptation decision algorithm for tess");
  BOOL_VAR_H(save_blob_choices, false,
             "Save the results of the recognition step"
             " (blob_choices) within the corresponding WERD_CHOICE");
  BOOL_VAR_H(test_pt, false, "Test for point");
  double_VAR_H(test_pt_x, 99999.99, "xcoord");
  double_VAR_H(test_pt_y, 99999.99, "ycoord");
  INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
  BOOL_VAR_H(paragraph_text_based, true,
             "Run paragraph detection on the post-text-recognition "
             "(more accurate)");
  INT_VAR_H(cube_debug_level, 1, "Print cube debug info.");
  STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
  STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
@ -832,6 +847,26 @@ class Tesseract : public Wordrec {
  INT_VAR_H(x_ht_acceptance_tolerance, 8,
            "Max allowed deviation of blob top outside of font data");
  INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
  INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
  double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse "
               "certainty does a superscript position glyph need to be for us "
               "to try classifying it as a char with a different baseline?");
  double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in "
               "badness do we think sufficient to choose a superscript over "
               "what we'd thought.  For example, a value of 0.6 means we want "
               "to reduce badness of certainty by 40%");
  double_VAR_H(superscript_scaledown_ratio, 0.4,
               "A superscript scaled down more than this is unbelievably "
               "small.  For example, 0.3 means we expect the font size to "
               "be no smaller than 30% of the text line font size.");
  double_VAR_H(subscript_max_y_top, 0.5,
               "Maximum top of a character measured as a multiple of x-height "
               "above the baseline for us to reconsider whether it's a "
               "subscript.");
  double_VAR_H(superscript_min_y_bottom, 0.3,
              "Minimum bottom of a character measured as a multiple of "
              "x-height above the baseline for us to reconsider whether it's "
              "a superscript.");
  BOOL_VAR_H(tessedit_write_block_separators, false,
             "Write block separators in output");
  BOOL_VAR_H(tessedit_write_rep_codes, false,
@ -856,7 +891,6 @@ class Tesseract : public Wordrec {
             "Dont reject ANYTHING AT ALL");
  BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
  INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
  INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
  BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
  BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
  double_VAR_H(tessedit_lower_flip_hyphen, 1.5,
@ -888,17 +922,81 @@ class Tesseract : public Wordrec {
            "Debug level for TessdataManager functions.");
  STRING_VAR_H(tessedit_load_sublangs, "",
               "List of languages to load with this one");
  BOOL_VAR_H(tessedit_use_primary_params_model, false,
             "In multilingual mode use params model of the primary language");
  // Min acceptable orientation margin (difference in scores between top and 2nd
  // choice in OSResults::orientations) to believe the page orientation.
  double_VAR_H(min_orientation_margin, 7.0,
               "Min acceptable orientation margin");
  BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
  BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model");
  BOOL_VAR_H(poly_allow_detailed_fx, false,
             "Allow feature extractors to see the original outline");
  BOOL_VAR_H(tessedit_init_config_only, false,
             "Only initialize with the config file. Useful if the instance is "
             "not going to be used for OCR but say only for layout analysis.");
  BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
  // The following parameters were deprecated and removed from their original
  // locations. The parameters are temporarily kept here to give Tesseract
  // users a chance to update their [lang].traineddata and config files
  // without introducing failures during Tesseract initialization.
  // TODO(ocr-team): remove these parameters from the code once we are
  // reasonably sure that Tesseract users have updated their data files.
  //
  // BEGIN DEPRECATED PARAMETERS
  INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
  BOOL_VAR_H(load_fixed_length_dawgs, true,  "Load fixed length"
             " dawgs (e.g. for non-space delimited languages)");
  INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
  BOOL_VAR_H(permute_debug, 0, "char permutation debug");
  double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
               " current best rate to prune other hypotheses");
  BOOL_VAR_H(permute_script_word, 0,
             "Turn on word script consistency permuter");
  BOOL_VAR_H(segment_segcost_rating, 0,
             "incorporate segmentation cost in word rating?");
  double_VAR_H(segment_reward_script, 0.95,
               "Score multipler for script consistency within a word. "
               "Being a 'reward' factor, it should be <= 1. "
               "Smaller value implies bigger reward.");
  BOOL_VAR_H(permute_fixed_length_dawg, 0,
             "Turn on fixed-length phrasebook search permuter");
  BOOL_VAR_H(permute_chartype_word, 0,
             "Turn on character type (property) consistency permuter");
  double_VAR_H(segment_reward_chartype, 0.97,
               "Score multipler for char type consistency within a word. ");
  double_VAR_H(segment_reward_ngram_best_choice, 0.99,
               "Score multipler for ngram permuter's best choice"
               " (only used in the Han script path).");
  BOOL_VAR_H(ngram_permuter_activated, false,
             "Activate character-level n-gram-based permuter");
  BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
  INT_VAR_H(language_model_fixed_length_choices_depth, 3,
            "Depth of blob choice lists to explore"
            " when fixed length dawgs are on");
  BOOL_VAR_H(use_new_state_cost, FALSE,
             "use new state cost heuristics for segmentation state evaluation");
  // Deprecated parameter, kept so that old config files still load.
  // The description is assembled from adjacent string literals, so each piece
  // has to carry its own separating space.
  double_VAR_H(heuristic_segcost_rating_base, 1.25,
               "base factor for adding segmentation cost into word rating. "
               "It's a multiplying factor, the larger the value above 1, "
               "the bigger the effect of segmentation cost.");
  double_VAR_H(heuristic_weight_rating, 1,
               "weight associated with char rating in combined cost of state");
  double_VAR_H(heuristic_weight_width, 1000.0,
               "weight associated with width evidence in combined cost of"
               " state");
  double_VAR_H(heuristic_weight_seamcut, 0,
               "weight associated with seam cut in combined cost of state");
  double_VAR_H(heuristic_max_char_wh_ratio, 2.0,
               "max char width-to-height ratio allowed in segmentation");
  // Deprecated parameter, kept so that old config files still load.
  // The default shown here mirrors the BOOL_MEMBER initializer in the
  // Tesseract constructor, which sets this parameter to true.
  BOOL_VAR_H(enable_new_segsearch, true,
             "Enable new segmentation search path.");
  // Deprecated parameter, kept so that old config files still load.
  // The continuation literal starts with a space so the concatenated text
  // does not run "for" and "fixed-pitch" together; wording matches the
  // description used by the constructor's initializer.
  double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
               "Maximum character width-to-height ratio for"
               " fixed-pitch fonts");
  // END DEPRECATED PARAMETERS
  //// ambigsrecog.cpp /////////////////////////////////////////////////////////
  FILE *init_recog_training(const STRING &fname);
  void recog_training_segmented(const STRING &fname,
@ -927,6 +1025,8 @@ class Tesseract : public Wordrec {
  Pix* cube_binary_;
  // Grey-level input image if the input was not binary, otherwise NULL.
  Pix* pix_grey_;
  // Thresholds that were used to generate the thresholded image from grey.
  Pix* pix_thresholds_;
  // Input image resolution after any scaling. The resolution is not well
  // transmitted by operations on Pix, so we keep an independent record here.
  int source_resolution_;