Added simultaneous multi-language capability, Refactored top-level word recognition module, Blamer module added for error analysis, Tidied up constraints on control parameters, Added UNICHARSET to WERD_CHOICE to make multi-language handling easier, Added word bigram correction

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@655 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-01-18 06:30:14 +08:00 · 2012-02-02 03:06:39 +00:00 · 2012-02-02 03:06:39 +00:00 · 9206e92b0d
commit 9206e92b0d
parent 73adf693d5
45 changed files with 3069 additions and 407 deletions
--- a/ccstruct/Makefile.am
+++ b/ccstruct/Makefile.am
@ -4,11 +4,12 @@ AM_CPPFLAGS = \
    -I$(top_srcdir)/image -I$(top_srcdir)/viewer

 include_HEADERS = \
-    blckerr.h blobbox.h blobs.h blread.h boxword.h ccstruct.h coutln.h crakedge.h \
-    detlinefit.h dppoint.h genblob.h hpddef.h hpdsizes.h ipoints.h \
+    blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
+    detlinefit.h dppoint.h fontinfo.h genblob.h hpddef.h hpdsizes.h ipoints.h \
    linlsq.h matrix.h mod128.h normalis.h \
-    ocrblock.h ocrrow.h otsuthr.h \
-    pageres.h pdblock.h points.h polyaprx.h polyblk.h \
+    ocrblock.h ocrpara.h ocrrow.h otsuthr.h \
+    pageres.h params_training_featdef.h \
+    pdblock.h points.h polyaprx.h polyblk.h \
    publictypes.h \
    quadlsq.h quadratc.h quspline.h ratngs.h rect.h rejctmap.h \
    seam.h split.h statistc.h stepblob.h vecfuncs.h werd.h
@ -26,10 +27,10 @@ libtesseract_ccstruct_la_LIBADD = \
 endif

 libtesseract_ccstruct_la_SOURCES = \
-    blobbox.cpp blobs.cpp blread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
-    detlinefit.cpp dppoint.cpp genblob.cpp \
+    blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
+    detlinefit.cpp dppoint.cpp fontinfo.cpp genblob.cpp \
    linlsq.cpp matrix.cpp mod128.cpp normalis.cpp \
-    ocrblock.cpp ocrrow.cpp otsuthr.cpp \
+    ocrblock.cpp ocrpara.cpp ocrrow.cpp otsuthr.cpp \
    pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \
    publictypes.cpp \
    quadlsq.cpp quadratc.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \
--- a/ccstruct/blobbox.cpp
+++ b/ccstruct/blobbox.cpp
@ -32,14 +32,26 @@ const double kCosSmallAngle = 0.866;
 const double kDefiniteAspectRatio = 2.0;
 // Multiple of short length in perimeter to make a joined word.
 const double kComplexShapePerimeterRatio = 1.5;
+// Min multiple of linesize for medium-sized blobs in ReFilterBlobs.
+const double kMinMediumSizeRatio = 0.25;
+// Max multiple of linesize for medium-sized blobs in ReFilterBlobs.
+const double kMaxMediumSizeRatio = 4.0;

+// Rotates the box and the underlying blob.
 void BLOBNBOX::rotate(FCOORD rotation) {
  cblob_ptr->rotate(rotation);
  rotate_box(rotation);
  compute_bounding_box();
 }

-// Rotate the box by the angle given by rotation.
+// Reflect the box in the y-axis, leaving the underlying blob untouched.
+void BLOBNBOX::reflect_box_in_y_axis() {
+  int left = -box.right();
+  box.set_right(-box.left());
+  box.set_left(left);
+}
+
+// Rotates the box by the angle given by rotation.
 // If the blob is a diacritic, then only small rotations for skew
 // correction can be applied.
 void BLOBNBOX::rotate_box(FCOORD rotation) {
@ -57,6 +69,7 @@ void BLOBNBOX::rotate_box(FCOORD rotation) {
    set_diacritic_box(box);
  }
 }
+
 /**********************************************************************
 * BLOBNBOX::merge
 *
@ -183,6 +196,17 @@ void BLOBNBOX::MinMaxGapsClipped(int* h_min, int* h_max,
  if (*v_max > max_dimension && *v_min < max_dimension) *v_max = *v_min;
 }

+// NULLs out any neighbours that are DeletableNoise to remove references.
+void BLOBNBOX::CleanNeighbours() {
+  for (int dir = 0; dir < BND_COUNT; ++dir) {
+    BLOBNBOX* neighbour = neighbours_[dir];
+    if (neighbour != NULL && neighbour->DeletableNoise()) {
+      neighbours_[dir] = NULL;
+      good_stroke_neighbours_[dir] = false;
+    }
+  }
+}
+
 // Returns positive if there is at least one side neighbour that has a similar
 // stroke width and is not on the other side of a rule line.
 int BLOBNBOX::GoodTextBlob() const {
@ -195,6 +219,18 @@ int BLOBNBOX::GoodTextBlob() const {
  return score;
 }

+// Returns the number of side neighbours that are of type BRT_NOISE.
+int BLOBNBOX::NoisyNeighbours() const {
+  int count = 0;
+  for (int dir = 0; dir < BND_COUNT; ++dir) {
+    BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
+    BLOBNBOX* blob = neighbour(bnd);
+    if (blob != NULL && blob->region_type() == BRT_NOISE)
+      ++count;
+  }
+  return count;
+}
+
 // Returns true, and sets vert_possible/horz_possible if the blob has some
 // feature that makes it individually appear to flow one way.
 // eg if it has a high aspect ratio, yet has a complex shape, such as a
@ -281,7 +317,8 @@ bool BLOBNBOX::MatchingStrokeWidth(const BLOBNBOX& other,
 // given horizontal range.
 TBOX BLOBNBOX::BoundsWithinLimits(int left, int right) {
  FCOORD no_rotation(1.0f, 0.0f);
-  float top, bottom;
+  float top = box.top();
+  float bottom = box.bottom();
  if (cblob_ptr != NULL) {
    find_cblob_limits(cblob_ptr, static_cast<float>(left),
                      static_cast<float>(right), no_rotation,
@ -300,7 +337,54 @@ TBOX BLOBNBOX::BoundsWithinLimits(int left, int right) {
  return shrunken_box;
 }

+// Helper to call CleanNeighbours on all blobs on the list.
+void BLOBNBOX::CleanNeighbours(BLOBNBOX_LIST* blobs) {
+  BLOBNBOX_IT blob_it(blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    blob_it.data()->CleanNeighbours();
+  }
+}
+
+// Helper to delete all the deletable blobs on the list.
+void BLOBNBOX::DeleteNoiseBlobs(BLOBNBOX_LIST* blobs) {
+  BLOBNBOX_IT blob_it(blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    if (blob->DeletableNoise()) {
+      delete blob->cblob();
+      delete blob_it.extract();
+    }
+  }
+}
+
 #ifndef GRAPHICS_DISABLED
+// Helper to draw all the blobs on the list in the given body_colour,
+// with child outlines in the child_colour.
+void BLOBNBOX::PlotBlobs(BLOBNBOX_LIST* list,
+                         ScrollView::Color body_colour,
+                         ScrollView::Color child_colour,
+                         ScrollView* win) {
+  BLOBNBOX_IT it(list);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    it.data()->plot(win, body_colour, child_colour);
+  }
+}
+
+// Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
+// given list in the given body_colour, with child outlines in the
+// child_colour.
+void BLOBNBOX::PlotNoiseBlobs(BLOBNBOX_LIST* list,
+                              ScrollView::Color body_colour,
+                              ScrollView::Color child_colour,
+                              ScrollView* win) {
+  BLOBNBOX_IT it(list);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    BLOBNBOX* blob = it.data();
+    if (blob->DeletableNoise())
+      blob->plot(win, body_colour, child_colour);
+  }
+}
+
 ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type,
                                          BlobTextFlowType flow_type) {
  switch (region_type) {
@ -329,6 +413,8 @@ ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type,
        return ScrollView::MEDIUM_BLUE;
      if (flow_type == BTFT_LEADER)
        return ScrollView::WHEAT;
+      if (flow_type == BTFT_NONTEXT)
+        return ScrollView::PINK;
      return ScrollView::MAGENTA;
    default:
      return ScrollView::GREY;
@ -678,6 +764,7 @@ void TO_ROW::clear() {
  spacing = 0.0;
  xheight = 0.0;
  xheight_evidence = 0;
+  body_size = 0.0;
  ascrise = 0.0;
  descdrop = 0.0;
  min_space = 0;
@ -813,15 +900,97 @@ TO_BLOCK::~TO_BLOCK() {
  clear_blobnboxes(&large_blobs);
 }

+// Helper function to divide the input blobs over noise, small, medium
+// and large lists. Blobs small in height and (small in width or large in width)
+// go in the noise list. Dash (-) candidates go in the small list, and
+// medium and large are by height.
+// SIDE-EFFECT: resets all blobs to their initial state by calling ReInit().
+static void SizeFilterBlobs(int min_height, int max_height,
+                            BLOBNBOX_LIST* src_list,
+                            BLOBNBOX_LIST* noise_list,
+                            BLOBNBOX_LIST* small_list,
+                            BLOBNBOX_LIST* medium_list,
+                            BLOBNBOX_LIST* large_list) {
+  BLOBNBOX_IT noise_it(noise_list);
+  BLOBNBOX_IT small_it(small_list);
+  BLOBNBOX_IT medium_it(medium_list);
+  BLOBNBOX_IT large_it(large_list);
+  for (BLOBNBOX_IT src_it(src_list); !src_it.empty(); src_it.forward()) {
+    BLOBNBOX* blob = src_it.extract();
+    blob->ReInit();
+    int width = blob->bounding_box().width();
+    int height = blob->bounding_box().height();
+    if (height < min_height  &&
+        (width < min_height || width > max_height))
+      noise_it.add_after_then_move(blob);
+    else if (height > max_height)
+      large_it.add_after_then_move(blob);
+    else if (height < min_height)
+      small_it.add_after_then_move(blob);
+    else
+      medium_it.add_after_then_move(blob);
+  }
+}
+
+// Reorganize the blob lists with a different definition of small, medium
+// and large, compared to the original definition.
+// Height is still the primary filter key, but medium width blobs of small
+// height become small, and very wide blobs of small height stay noise, along
+// with small dot-shaped blobs.
+void TO_BLOCK::ReSetAndReFilterBlobs() {
+  int min_height = IntCastRounded(kMinMediumSizeRatio * line_size);
+  int max_height = IntCastRounded(kMaxMediumSizeRatio * line_size);
+  BLOBNBOX_LIST noise_list;
+  BLOBNBOX_LIST small_list;
+  BLOBNBOX_LIST medium_list;
+  BLOBNBOX_LIST large_list;
+  SizeFilterBlobs(min_height, max_height, &blobs,
+                  &noise_list, &small_list, &medium_list, &large_list);
+  SizeFilterBlobs(min_height, max_height, &large_blobs,
+                  &noise_list, &small_list, &medium_list, &large_list);
+  SizeFilterBlobs(min_height, max_height, &small_blobs,
+                  &noise_list, &small_list, &medium_list, &large_list);
+  SizeFilterBlobs(min_height, max_height, &noise_blobs,
+                  &noise_list, &small_list, &medium_list, &large_list);
+  BLOBNBOX_IT blob_it(&blobs);
+  blob_it.add_list_after(&medium_list);
+  blob_it.set_to_list(&large_blobs);
+  blob_it.add_list_after(&large_list);
+  blob_it.set_to_list(&small_blobs);
+  blob_it.add_list_after(&small_list);
+  blob_it.set_to_list(&noise_blobs);
+  blob_it.add_list_after(&noise_list);
+}
+
+// Deletes noise blobs from all lists where not owned by a ColPartition.
+void TO_BLOCK::DeleteUnownedNoise() {
+  BLOBNBOX::CleanNeighbours(&blobs);
+  BLOBNBOX::CleanNeighbours(&small_blobs);
+  BLOBNBOX::CleanNeighbours(&noise_blobs);
+  BLOBNBOX::CleanNeighbours(&large_blobs);
+  BLOBNBOX::DeleteNoiseBlobs(&blobs);
+  BLOBNBOX::DeleteNoiseBlobs(&small_blobs);
+  BLOBNBOX::DeleteNoiseBlobs(&noise_blobs);
+  BLOBNBOX::DeleteNoiseBlobs(&large_blobs);
+}
+
 #ifndef GRAPHICS_DISABLED
+// Draw the noise blobs from all lists in red.
+void TO_BLOCK::plot_noise_blobs(ScrollView* win) {
+  BLOBNBOX::PlotNoiseBlobs(&noise_blobs, ScrollView::RED, ScrollView::RED, win);
+  BLOBNBOX::PlotNoiseBlobs(&small_blobs, ScrollView::RED, ScrollView::RED, win);
+  BLOBNBOX::PlotNoiseBlobs(&large_blobs, ScrollView::RED, ScrollView::RED, win);
+  BLOBNBOX::PlotNoiseBlobs(&blobs, ScrollView::RED, ScrollView::RED, win);
+}
+
 // Draw the blobs on the various lists in the block in different colors.
-void TO_BLOCK::plot_graded_blobs(ScrollView* to_win) {
-  plot_blob_list(to_win, &noise_blobs, ScrollView::CORAL, ScrollView::BLUE);
-  plot_blob_list(to_win, &small_blobs,
-                 ScrollView::GOLDENROD, ScrollView::YELLOW);
-  plot_blob_list(to_win, &large_blobs,
-                 ScrollView::DARK_GREEN, ScrollView::YELLOW);
-  plot_blob_list(to_win, &blobs, ScrollView::WHITE, ScrollView::BROWN);
+void TO_BLOCK::plot_graded_blobs(ScrollView* win) {
+  BLOBNBOX::PlotBlobs(&noise_blobs, ScrollView::CORAL, ScrollView::BLUE, win);
+  BLOBNBOX::PlotBlobs(&small_blobs, ScrollView::GOLDENROD, ScrollView::YELLOW,
+                      win);
+  BLOBNBOX::PlotBlobs(&large_blobs, ScrollView::DARK_GREEN, ScrollView::YELLOW,
+                      win);
+  BLOBNBOX::PlotBlobs(&blobs, ScrollView::WHITE, ScrollView::BROWN, win);
 }

 /**********************************************************************
--- a/ccstruct/blobbox.h
+++ b/ccstruct/blobbox.h
@ -28,9 +28,9 @@

 enum PITCH_TYPE
 {
-  PITCH_DUNNO,                   //insufficient data
-  PITCH_DEF_FIXED,               //definitely fixed
-  PITCH_MAYBE_FIXED,             //could be
+  PITCH_DUNNO,                   // insufficient data
+  PITCH_DEF_FIXED,               // definitely fixed
+  PITCH_MAYBE_FIXED,             // could be
  PITCH_DEF_PROP,
  PITCH_MAYBE_PROP,
  PITCH_CORR_FIXED,
@ -38,13 +38,16 @@ enum PITCH_TYPE
 };

 // The possible tab-stop types of each side of a BLOBNBOX.
+// The ordering is important, as it is used for deleting dead-ends in the
+// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
+// non-aligned, unset, or deleted members.
 enum TabType {
-  TT_NONE,         // Not a tab.
-  TT_DELETED,      // Not a tab after detailed analysis.
-  TT_UNCONFIRMED,  // Initial designation of a tab-stop candidate.
-  TT_FAKE,         // Added by interpolation.
-  TT_CONFIRMED,    // Aligned with neighbours.
-  TT_VLINE         // Detected as a vertical line.
+  TT_NONE,           // Not a tab.
+  TT_DELETED,        // Not a tab after detailed analysis.
+  TT_MAYBE_RAGGED,   // Initial designation of a tab-stop candidate.
+  TT_MAYBE_ALIGNED,  // Initial designation of a tab-stop candidate.
+  TT_CONFIRMED,      // Aligned with neighbours.
+  TT_VLINE           // Detected as a vertical line.
 };

 // The possible region types of a BLOBNBOX.
@ -65,6 +68,7 @@ enum BlobRegionType {
 };

 // enum for elements of arrays that refer to neighbours.
+// NOTE: keep in this order, so that XOR with 2 (dir ^ 2) flips the direction.
 enum BlobNeighbourDir {
  BND_LEFT,
  BND_BELOW,
@ -73,6 +77,21 @@ enum BlobNeighbourDir {
  BND_COUNT
 };

+// enum for special type of text characters, such as math symbol or italic.
+enum BlobSpecialTextType {
+  BSTT_NONE,  // No special.
+  BSTT_ITALIC,  // Italic style.
+  BSTT_DIGIT,  // Digit symbols.
+  BSTT_MATH,  // Mathematical symbols (not including digits).
+  BSTT_UNCLEAR,  // Characters with low recognition rate.
+  BSTT_SKIP,  // Characters that we skip labeling (usually too small).
+  BSTT_COUNT
+};
+
+inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
+  return static_cast<BlobNeighbourDir>(dir ^ 2);
+}
+
 // BlobTextFlowType indicates the quality of neighbouring information
 // related to a chain of connected components, either horizontally or
 // vertically. Also used by ColPartition for the collection of blobs
@ -89,14 +108,10 @@ enum BlobTextFlowType {
 };

 // Returns true if type1 dominates type2 in a merge. Mostly determined by the
-// ordering of the enum, but NONTEXT dominates everything else, and LEADER
-// dominates nothing.
+// ordering of the enum, but LEADER is weak and dominates nothing.
 // The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
 // this cannot be true if t1 == t2, so the result is undefined.
 inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
-  // NONTEXT dominates everything.
-  if (type1 == BTFT_NONTEXT) return true;
-  if (type2 == BTFT_NONTEXT) return false;
  // LEADER always loses.
  if (type1 == BTFT_LEADER) return false;
  if (type2 == BTFT_LEADER) return true;
@ -127,8 +142,17 @@ class BLOBNBOX:public ELIST_LINK
      return new BLOBNBOX(blob);
    }

-    void rotate_box(FCOORD rotation);
+    // Rotates the box and the underlying blob.
    void rotate(FCOORD rotation);
+
+    // Methods that act on the box without touching the underlying blob.
+    // Reflect the box in the y-axis, leaving the underlying blob untouched.
+    void reflect_box_in_y_axis();
+    // Rotates the box by the angle given by rotation.
+    // If the blob is a diacritic, then only small rotations for skew
+    // correction can be applied.
+    void rotate_box(FCOORD rotation);
+    // Moves just the box by the given vector.
    void translate_box(ICOORD v) {
      if (IsDiacritic()) {
        box.move(v);
@ -150,7 +174,17 @@ class BLOBNBOX:public ELIST_LINK
    void NeighbourGaps(int gaps[BND_COUNT]) const;
    void MinMaxGapsClipped(int* h_min, int* h_max,
                           int* v_min, int* v_max) const;
+    void CleanNeighbours();
+    // Returns positive if there is at least one side neighbour that has a
+    // similar stroke width and is not on the other side of a rule line.
    int GoodTextBlob() const;
+    // Returns the number of side neighbours that are of type BRT_NOISE.
+    int NoisyNeighbours() const;
+
+    // Returns true if the blob is noise and has no owner.
+    bool DeletableNoise() const {
+      return owner() == NULL && region_type() == BRT_NOISE;
+    }

    // Returns true, and sets vert_possible/horz_possible if the blob has some
    // feature that makes it individually appear to flow one way.
@ -229,6 +263,12 @@ class BLOBNBOX:public ELIST_LINK
    void set_region_type(BlobRegionType new_type) {
      region_type_ = new_type;
    }
+    BlobSpecialTextType special_text_type() const {
+      return spt_type_;
+    }
+    void set_special_text_type(BlobSpecialTextType new_type) {
+      spt_type_ = new_type;
+    }
    BlobTextFlowType flow() const {
      return flow_;
    }
@ -323,10 +363,23 @@ class BLOBNBOX:public ELIST_LINK
    int base_char_bottom() const {
      return base_char_bottom_;
    }
+    int line_crossings() const {
+      return line_crossings_;
+    }
+    void set_line_crossings(int value) {
+      line_crossings_ = value;
+    }
    void set_diacritic_box(const TBOX& diacritic_box) {
      base_char_top_ = diacritic_box.top();
      base_char_bottom_ = diacritic_box.bottom();
    }
+    BLOBNBOX* base_char_blob() const {
+      return base_char_blob_;
+    }
+    void set_base_char_blob(BLOBNBOX* blob) {
+      base_char_blob_ = blob;
+    }
+
    bool UniquelyVertical() const {
      return vert_possible_ && !horz_possible_;
    }
@ -350,11 +403,29 @@ class BLOBNBOX:public ELIST_LINK
    static bool UnMergeableType(BlobRegionType type) {
      return IsLineType(type) || IsImageType(type);
    }
+    // Helper to call CleanNeighbours on all blobs on the list.
+    static void CleanNeighbours(BLOBNBOX_LIST* blobs);
+    // Helper to delete all the deletable blobs on the list.
+    static void DeleteNoiseBlobs(BLOBNBOX_LIST* blobs);
+
+#ifndef GRAPHICS_DISABLED
+    // Helper to draw all the blobs on the list in the given body_colour,
+    // with child outlines in the child_colour.
+    static void PlotBlobs(BLOBNBOX_LIST* list,
+                          ScrollView::Color body_colour,
+                          ScrollView::Color child_colour,
+                          ScrollView* win);
+    // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
+    // given list in the given body_colour, with child outlines in the
+    // child_colour.
+    static void PlotNoiseBlobs(BLOBNBOX_LIST* list,
+                               ScrollView::Color body_colour,
+                               ScrollView::Color child_colour,
+                               ScrollView* win);

    static ScrollView::Color TextlineColor(BlobRegionType region_type,
                                           BlobTextFlowType flow_type);

-#ifndef GRAPHICS_DISABLED
    // Keep in sync with BlobRegionType.
    ScrollView::Color BoxColor() const;

@ -386,6 +457,7 @@ class BLOBNBOX:public ELIST_LINK
    right_tab_type_ = TT_NONE;
    region_type_ = BRT_UNKNOWN;
    flow_ = BTFT_NONE;
+    spt_type_ = BSTT_SKIP;
    left_rule_ = 0;
    right_rule_ = 0;
    left_crossing_rule_ = 0;
@ -395,6 +467,8 @@ class BLOBNBOX:public ELIST_LINK
    owner_ = NULL;
    base_char_top_ = box.top();
    base_char_bottom_ = box.bottom();
+    line_crossings_ = 0;
+    base_char_blob_ = NULL;
    horz_possible_ = false;
    vert_possible_ = false;
    leader_on_left_ = false;
@ -427,10 +501,13 @@ class BLOBNBOX:public ELIST_LINK
  inT16 right_crossing_rule_;   // x-coord of nearest or crossing rule line
  inT16 base_char_top_;         // y-coord of top/bottom of diacritic base,
  inT16 base_char_bottom_;      // if it exists else top/bottom of this blob.
+  int line_crossings_;          // Number of line intersections touched.
+  BLOBNBOX* base_char_blob_;    // The blob that was the base char.
  float horz_stroke_width_;     // Median horizontal stroke width
  float vert_stroke_width_;     // Median vertical stroke width
  float area_stroke_width_;     // Stroke width from area/perimeter ratio.
  tesseract::ColPartition* owner_;  // Who will delete me when I am not needed
+  BlobSpecialTextType spt_type_;   // Special text type.
  BLOBNBOX* neighbours_[BND_COUNT];
  bool good_stroke_neighbours_[BND_COUNT];
  bool horz_possible_;           // Could be part of horizontal flow.
@ -556,6 +633,8 @@ class TO_ROW: public ELIST2_LINK
    int xheight_evidence;        // number of blobs of height xheight
    float ascrise;               // ascenders
    float descdrop;              // descenders
+    float body_size;             // of CJK characters.  Assumed to be
+                                 // xheight+ascrise for non-CJK text.
    inT32 min_space;             // min size for real space
    inT32 max_nonspace;          // max size of non-space
    inT32 space_threshold;       // space vs nonspace
@ -640,8 +719,19 @@ class TO_BLOCK:public ELIST_LINK
      }
    }

-    // Draw the blobs on on the various lists in the block in different colors.
+    // Reorganizes the blob lists with a different definition of small, medium
+    // and large, compared to the original definition.
+    // Height is still the primary filter key, but blobs of small height and
+    // medium width become small, and very wide or dot-sized ones become noise.
+    void ReSetAndReFilterBlobs();
+
+    // Deletes noise blobs from all lists where not owned by a ColPartition.
+    void DeleteUnownedNoise();
+
 #ifndef GRAPHICS_DISABLED
+    // Draw the noise blobs from all lists in red.
+    void plot_noise_blobs(ScrollView* to_win);
+    // Draw the blobs on the various lists in the block in different colors.
    void plot_graded_blobs(ScrollView* to_win);
 #endif

--- a/ccstruct/blobs.cpp
+++ b/ccstruct/blobs.cpp
@ -29,6 +29,7 @@
 #include "mfcpch.h"
 #include "blobs.h"
 #include "ccstruct.h"
+#include "clst.h"
 #include "cutil.h"
 #include "emalloc.h"
 #include "helpers.h"
@ -46,15 +47,18 @@ using tesseract::CCStruct;
 // A Vector representing the "vertical" direction when measuring the
 // divisiblity of blobs into multiple blobs just by separating outlines.
 // See divisible_blob below for the use.
-const TPOINT kDivisibleVerticalUpright = {0, 1};
+const TPOINT kDivisibleVerticalUpright(0, 1);
 // A vector representing the "vertical" direction for italic text for use
 // when separating outlines. Using it actually deteriorates final accuracy,
 // so it is only used for ApplyBoxes chopping to get a better segmentation.
-const TPOINT kDivisibleVerticalItalic = {1, 5};
+const TPOINT kDivisibleVerticalItalic(1, 5);

 /*----------------------------------------------------------------------
              F u n c t i o n s
 ----------------------------------------------------------------------*/
+
+CLISTIZE(EDGEPT);
+
 // Consume the circular list of EDGEPTs to make a TESSLINE.
 TESSLINE* TESSLINE::BuildFromOutlineList(EDGEPT* outline) {
  TESSLINE* result = new TESSLINE;
@ -262,6 +266,36 @@ TBLOB* TBLOB::PolygonalCopy(C_BLOB* src) {
  return tblob;
 }

+// Normalizes the blob for classification only if needed.
+// (Normally this means a non-zero classify rotation.)
+// If no Normalization is needed, then NULL is returned, and the denorm is
+// unchanged. Otherwise a new TBLOB is returned and the denorm points to
+// a new DENORM. In this case, both the TBLOB and DENORM must be deleted.
+TBLOB* TBLOB::ClassifyNormalizeIfNeeded(const DENORM** denorm) const {
+  TBLOB* rotated_blob = NULL;
+  // If necessary, copy the blob and rotate it. The rotation is always
+  // +/- 90 degrees, as 180 was already taken care of.
+  if ((*denorm)->block() != NULL &&
+      (*denorm)->block()->classify_rotation().y() != 0.0) {
+    TBOX box = bounding_box();
+    int x_middle = (box.left() + box.right()) / 2;
+    int y_middle = (box.top() + box.bottom()) / 2;
+    rotated_blob = new TBLOB(*this);
+    const FCOORD& rotation = (*denorm)->block()->classify_rotation();
+    DENORM* norm = new DENORM;
+    // Move the rotated blob back to the same y-position so that we
+    // can still distinguish similar glyphs with different y-positions.
+    float target_y = kBlnBaselineOffset +
+        (rotation.y() > 0 ? x_middle - box.left() : box.right() - x_middle);
+    norm->SetupNormalization(NULL, NULL, &rotation, *denorm, NULL, 0,
+                             x_middle, y_middle, 1.0f, 1.0f, 0.0f, target_y);
+    //                             x_middle, y_middle, 1.0f, 1.0f, 0.0f, y_middle);
+    rotated_blob->Normalize(*norm);
+    *denorm = norm;
+  }
+  return rotated_blob;
+}
+
 // Copies the data and the outline, but leaves next untouched.
 void TBLOB::CopyFrom(const TBLOB& src) {
  Clear();
@ -289,7 +323,7 @@ void TBLOB::Clear() {
 void TBLOB::Normalize(const DENORM& denorm) {
  // TODO(rays) outline->Normalize is more accurate, but breaks tests due
  // the changes it makes. Reinstate this code with a retraining.
-#if 0
+#if 1
  for (TESSLINE* outline = outlines; outline != NULL; outline = outline->next) {
    outline->Normalize(denorm);
  }
@ -334,11 +368,20 @@ int TBLOB::NumOutlines() const {
  return result;
 }

+/**********************************************************************
+ * TBLOB::bounding_box()
+ *
+ * Compute the bounding_box of a compound blob, defined to be the
+ * bounding box of the union of all top-level outlines in the blob.
+ **********************************************************************/
 TBOX TBLOB::bounding_box() const {
-  TPOINT topleft;
-  TPOINT botright;
-  blob_bounding_box(this, &topleft, &botright);
-  TBOX box(topleft.x, botright.y, botright.x, topleft.y);
+  if (outlines == NULL)
+    return TBOX(0, 0, 0, 0);
+  TESSLINE *outline = outlines;
+  TBOX box = outline->bounding_box();
+  for (outline = outline->next; outline != NULL; outline = outline->next) {
+    box += outline->bounding_box();
+  }
  return box;
 }

@ -482,91 +525,10 @@ void TWERD::plot(ScrollView* window) {
 **********************************************************************/
 void blob_origin(TBLOB *blob,       /*blob to compute on */
                 TPOINT *origin) {  /*return value */
-  TPOINT topleft;                /*bounding box */
-  TPOINT botright;
-
-                                 /*find bounding box */
-  blob_bounding_box(blob, &topleft, &botright); 
-                                 /*centre of box */
-  origin->x = (topleft.x + botright.x) / 2;
-  origin->y = (topleft.y + botright.y) / 2;
+  TBOX bbox = blob->bounding_box();
+  *origin = (bbox.topleft() + bbox.botright()) / 2;
 }

-
-/**********************************************************************
- * blob_bounding_box
- *
- * Compute the bounding_box of a compound blob, define to be the
- * max coordinate value of the bounding boxes of all the top-level
- * outlines in the box.
- **********************************************************************/
-void blob_bounding_box(const TBLOB *blob,         // blob to compute on.
-                       TPOINT *topleft,           // bounding box.
-                       TPOINT *botright) {
-  register TESSLINE *outline;    // Current outline.
-
-  if (blob == NULL || blob->outlines == NULL) {
-    topleft->x = topleft->y = 0;
-    *botright = *topleft;        // Default value.
-  } else {
-    outline = blob->outlines;
-    *topleft = outline->topleft;
-    *botright = outline->botright;
-    for (outline = outline->next; outline != NULL; outline = outline->next) {
-      UpdateRange(outline->topleft.x, outline->botright.x,
-                  &topleft->x, &botright->x);
-      UpdateRange(outline->botright.y, outline->topleft.y,
-                  &botright->y, &topleft->y);
-    }
-  }
-}
-
-
-/**********************************************************************
- * blobs_bounding_box
- *
- * Return the smallest extreme point that contain this word.
- **********************************************************************/
-void blobs_bounding_box(TBLOB *blobs, TPOINT *topleft, TPOINT *botright) { 
-  TPOINT tl;
-  TPOINT br;
-  /* Start with first blob */
-  blob_bounding_box(blobs, topleft, botright); 
-
-  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { 
-    blob_bounding_box(blob, &tl, &br); 
-
-    if (tl.x < topleft->x)
-      topleft->x = tl.x;
-    if (tl.y > topleft->y)
-      topleft->y = tl.y;
-    if (br.x > botright->x)
-      botright->x = br.x;
-    if (br.y < botright->y)
-      botright->y = br.y;
-  }
-}
-
-
-/**********************************************************************
- * blobs_origin
- *
- * Compute the origin of a compound blob, define to be the centre
- * of the bounding box.
- **********************************************************************/
-void blobs_origin(TBLOB *blobs,      /*blob to compute on */
-                  TPOINT *origin) {  /*return value */
-  TPOINT topleft;                /*bounding box */
-  TPOINT botright;
-
-                                 /*find bounding box */
-  blobs_bounding_box(blobs, &topleft, &botright); 
-                                 /*center of box */
-  origin->x = (topleft.x + botright.x) / 2;
-  origin->y = (topleft.y + botright.y) / 2;
-}
-
-
 /**********************************************************************
 * blobs_widths
 *
@ -585,18 +547,18 @@ WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;

-  blob_bounding_box(blobs, &topleft, &botright); 
-  width_record->widths[i++] = botright.x - topleft.x;
+  TBOX bbox = blobs->bounding_box();
+  width_record->widths[i++] = bbox.width();
  /* First width */
-  blob_end = botright.x;
+  blob_end = bbox.right();

  for (TBLOB* blob = blobs->next; blob != NULL; blob = blob->next) {
-    blob_bounding_box(blob, &topleft, &botright); 
-    width_record->widths[i++] = topleft.x - blob_end;
-    width_record->widths[i++] = botright.x - topleft.x;
-    blob_end = botright.x;
+    TBOX curbox = blob->bounding_box();
+    width_record->widths[i++] = curbox.left() - blob_end;
+    width_record->widths[i++] = curbox.width();
+    blob_end = curbox.right();
  }
-  return (width_record);
+  return width_record;
 }


@ -630,8 +592,9 @@ bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location) {
       outline1 = outline1->next) {
    if (outline1->is_hole)
      continue;  // Holes do not count as separable.
-    TPOINT mid_pt1 = {(outline1->topleft.x + outline1->botright.x) / 2,
-                      (outline1->topleft.y + outline1->botright.y) / 2};
+    TPOINT mid_pt1(
+      static_cast<inT16>((outline1->topleft.x + outline1->botright.x) / 2),
+      static_cast<inT16>((outline1->topleft.y + outline1->botright.y) / 2));
    int mid_prod1 = CROSS(mid_pt1, vertical);
    int min_prod1, max_prod1;
    outline1->MinMaxCrossProduct(vertical, &min_prod1, &max_prod1);
@ -639,15 +602,16 @@ bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location) {
         outline2 = outline2->next) {
      if (outline2->is_hole)
        continue;  // Holes do not count as separable.
-      TPOINT mid_pt2 = {  (outline2->topleft.x + outline2->botright.x) / 2,
-                        (outline2->topleft.y + outline2->botright.y) / 2};
+      TPOINT mid_pt2(
+        static_cast<inT16>((outline2->topleft.x + outline2->botright.x) / 2),
+        static_cast<inT16>((outline2->topleft.y + outline2->botright.y) / 2));
      int mid_prod2 = CROSS(mid_pt2, vertical);
      int min_prod2, max_prod2;
      outline2->MinMaxCrossProduct(vertical, &min_prod2, &max_prod2);
      int mid_gap = abs(mid_prod2 - mid_prod1);
      int overlap = MIN(max_prod1, max_prod2) - MAX(min_prod1, min_prod2);
-      if (mid_gap - overlap / 2 > max_gap) {
-        max_gap = mid_gap - overlap / 2;
+      if (mid_gap - overlap / 4 > max_gap) {
+        max_gap = mid_gap - overlap / 4;
        *location = mid_pt1;
        *location += mid_pt2;
        *location /= 2;
@ -679,8 +643,9 @@ void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,
  int location_prod = CROSS(location, vertical);

  while (outline != NULL) {
-    TPOINT mid_pt = {(outline->topleft.x + outline->botright.x) / 2,
-                     (outline->topleft.y + outline->botright.y) / 2};
+    TPOINT mid_pt(
+      static_cast<inT16>((outline->topleft.x + outline->botright.x) / 2),
+      static_cast<inT16>((outline->topleft.y + outline->botright.y) / 2));
    int mid_prod = CROSS(mid_pt, vertical);
    if (mid_prod < location_prod) {
      // Outline is in left blob.
@ -705,4 +670,3 @@ void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,
  if (outline2)
    outline2->next = NULL;
 }
-
--- a/ccstruct/blobs.h
+++ b/ccstruct/blobs.h
@ -29,6 +29,7 @@
 /*----------------------------------------------------------------------
              I n c l u d e s
 ----------------------------------------------------------------------*/
+#include "clst.h"
 #include "rect.h"
 #include "vecfuncs.h"

@ -50,6 +51,10 @@ typedef struct
 } WIDTH_RECORD;

 struct TPOINT {
+  TPOINT(): x(0), y(0) {}
+  TPOINT(inT16 vx, inT16 vy) : x(vx), y(vy) {}
+  TPOINT(const ICOORD &ic) : x(ic.x()), y(ic.y()) {}
+
  void operator+=(const TPOINT& other) {
    x += other.x;
    y += other.y;
@ -102,6 +107,9 @@ struct EDGEPT {
  EDGEPT* prev;                  // clockwise element
 };

+// For use in chop and findseam to keep a list of which EDGEPTs were inserted.
+CLISTIZEH(EDGEPT);
+
 struct TESSLINE {
  TESSLINE() : is_hole(false), loop(NULL), next(NULL) {}
  TESSLINE(const TESSLINE& src) : loop(NULL), next(NULL) {
@ -176,6 +184,12 @@ struct TBLOB {
  // Factory to build a TBLOB from a C_BLOB with polygonal
  // approximation along the way.
  static TBLOB* PolygonalCopy(C_BLOB* src);
+  // Normalizes the blob for classification only if needed.
+  // (Normally this means a non-zero classify rotation.)
+  // If no Normalization is needed, then NULL is returned, and the denorm is
+  // unchanged. Otherwise a new TBLOB is returned and the denorm points to
+  // a new DENORM. In this case, both the TBLOB and DENORM must be deleted.
+  TBLOB* ClassifyNormalizeIfNeeded(const DENORM** denorm) const;
  // Copies the data and the outlines, but leaves next untouched.
  void CopyFrom(const TBLOB& src);
  // Deletes owned data.
@ -274,23 +288,12 @@ if (w) memfree (w)
 ----------------------------------------------------------------------*/
 // TODO(rays) This will become a member of TBLOB when TBLOB's definition
 // moves to blobs.h
-TBOX TBLOB_bounding_box(const TBLOB* blob);

-void blob_origin(TBLOB *blob,      /*blob to compute on */
-                 TPOINT *origin);  /*return value */
+// Returns the center of blob's bounding box in origin.
+void blob_origin(TBLOB *blob, TPOINT *origin);

                                 /*blob to compute on */
-void blob_bounding_box(const TBLOB *blob,
-                       TPOINT *topleft,  // Bounding box.
-                       TPOINT *botright);
-
-void blobs_bounding_box(TBLOB *blobs, TPOINT *topleft, TPOINT *botright); 
-
-void blobs_origin(TBLOB *blobs,     /*blob to compute on */
-                  TPOINT *origin);  /*return value */
-
-                                 /*blob to compute on */
-WIDTH_RECORD *blobs_widths(TBLOB *blobs); 
+WIDTH_RECORD *blobs_widths(TBLOB *blobs);

 bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location);

--- a/ccstruct/boxread.cpp
+++ b/ccstruct/boxread.cpp
@ -0,0 +1,164 @@
+/**********************************************************************
+ * File:        boxread.cpp
+ * Description: Read data from a box file.
+ * Author:      Ray Smith
+ * Created:     Fri Aug 24 17:47:23 PDT 2007
+ *
+ * (C) Copyright 2007, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "mfcpch.h"
+#include "boxread.h"
+#include <string.h>
+
+#include "rect.h"
+#include "strngs.h"
+#include "tprintf.h"
+#include "unichar.h"
+
+// Special char code used to identify multi-blob labels.
+// A box-file line whose first field is exactly this string describes a
+// word/line-level box; the text it covers follows a '#' after the
+// coordinates (see ParseBoxFileStr).
+static const char* kMultiBlobLabelCode = "WordStr";
+
+// Opens, for binary reading, the box file that belongs to the image fname.
+// The box file name is fname with everything from its last '.' onwards
+// replaced by ".box" (or with ".box" appended if fname contains no '.').
+// Reports a CANTOPENFILE error (with TESSEXIT) if the file cannot be opened.
+FILE* OpenBoxFile(const STRING& fname) {
+  STRING box_name = fname;
+  const char* base = box_name.string();
+  const char* ext_dot = strrchr(base, '.');
+  if (ext_dot != NULL) {
+    // Cut the name off at the extension.
+    box_name[ext_dot - base] = '\0';
+  }
+  box_name += ".box";
+
+  FILE* box_file = fopen(box_name.string(), "rb");
+  if (box_file == NULL) {
+    CANTOPENFILE.error("read_next_box", TESSEXIT,
+                       "Cant open box file %s",
+                       box_name.string());
+  }
+  return box_file;
+}
+
+// Box files are used ONLY DURING TRAINING, both when creating tr files with
+// tesseract and by unicharset_extractor. ReadNextBox holds the shared code
+// that interprets one line of a box file, so that applybox and
+// unicharset_extractor read box files the same way.
+// Fetches the next valid box: utf8_str receives the unichar string and
+// bounding_box the box coordinates. Returns true on success, or false on
+// eof (in which case the file is closed).
+// The utf8 ByteOrderMark (U+FEFF=EF BB BF) is ignored, the text is checked
+// for valid utf-8, and fields may be separated by space or tab.
+// This overload accepts boxes from every page in the file.
+bool ReadNextBox(int *line_number, FILE* box_file,
+                 STRING* utf8_str, TBOX* bounding_box) {
+  const int any_page = -1;  // Tells the page-aware overload not to filter.
+  return ReadNextBox(any_page, line_number, box_file, utf8_str, bounding_box);
+}
+
+// As ReadNextBox above, but get a specific page number. (0-based)
+// Use -1 to read any page number. Files without page number all
+// read as if they are page 0.
+// *line_number is incremented for every line read from box_file.
+// Blank lines (only spaces/tabs before the line terminator) are skipped
+// silently; lines that fail to parse are reported with tprintf and skipped.
+// Returns true with utf8_str and bounding_box filled in, or false at eof,
+// in which case box_file has been closed.
+bool ReadNextBox(int target_page, int *line_number, FILE* box_file,
+                 STRING* utf8_str, TBOX* bounding_box) {
+  int page = 0;
+  char buff[kBoxReadBufSize];   // boxfile read buffer
+
+  while (fgets(buff, sizeof(buff) - 1, box_file)) {
+    (*line_number)++;
+
+    char *buffptr = buff;
+    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
+    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
+      buffptr += 3;  // Skip unicode file designation.
+    // Check for blank lines in box file
+    while (*buffptr == ' ' || *buffptr == '\t')
+      buffptr++;
+    // fgets keeps the line terminator, so a blank line shows up here as
+    // "\n" or "\r\n" rather than as an empty string.
+    if (*buffptr == '\0' || *buffptr == '\n' ||
+        (*buffptr == '\r' && (buffptr[1] == '\n' || buffptr[1] == '\0')))
+      continue;
+    if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) {
+      tprintf("Box file format error on line %i; ignored\n", *line_number);
+      continue;
+    }
+    if (target_page >= 0 && target_page != page)
+      continue;  // Not on the appropriate page.
+    return true;  // Successfully read a box.
+  }
+  fclose(box_file);
+  return false;  // EOF
+}
+
+// Parses the given box file string into a page_number, utf8_str, and
+// bounding_box. Returns true on a successful parse.
+// The box file is assumed to contain box definitions, one per line, of the
+// following format for blob-level boxes:
+//   <UTF8 str> <left> <bottom> <right> <top> <page id>
+// and for word/line-level boxes:
+//   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+// The page id is optional; *page_number is set to 0 when it is absent.
+// On failure a message is printed with tprintf, and utf8_str and
+// bounding_box are left empty.
+// See applybox.cpp for more information.
+bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
+                     STRING* utf8_str, TBOX* bounding_box) {
+  *bounding_box = TBOX();       // Initialize it to empty.
+  *utf8_str = "";
+  char uch[kBoxReadBufSize];
+  const char *buffptr = boxfile_str;
+  // Read the unichar without messing up on Tibetan.
+  // According to issue 253 the utf-8 surrogates 85 and A0 are treated
+  // as whitespace by sscanf, so it is more reliable to just find
+  // ascii space and tab.
+  int uch_len = 0;
+  while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
+         uch_len < kBoxReadBufSize - 1) {
+    uch[uch_len++] = *buffptr++;
+  }
+  uch[uch_len] = '\0';
+  if (*buffptr != '\0') ++buffptr;
+  int x_min, y_min, x_max, y_max;
+  *page_number = 0;
+  int count = sscanf(buffptr, "%d %d %d %d %d",
+                 &x_min, &y_min, &x_max, &y_max, page_number);
+  if (count != 5 && count != 4) {
+    tprintf("Bad box coordinates in boxfile string!\n");
+    return false;
+  }
+  // Test for long space-delimited string label.
+  if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
+      (buffptr = strchr(buffptr, '#')) != NULL) {
+    // strncpy does not terminate the destination when the source is too
+    // long, so copy one byte less than the buffer and terminate explicitly.
+    strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
+    uch[kBoxReadBufSize - 1] = '\0';
+    chomp_string(uch);
+    uch_len = strlen(uch);
+  }
+  // Validate UTF8 by making unichars with it.
+  int used = 0;
+  while (used < uch_len) {
+    UNICHAR ch(uch + used, uch_len - used);
+    int new_used = ch.utf8_len();
+    if (new_used == 0) {
+      // Print the byte as unsigned: a plain (possibly signed) char >= 0x80
+      // would be sign-extended and show up as ffffffXX.
+      tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",
+              uch + used,
+              static_cast<unsigned int>(static_cast<unsigned char>(uch[used])),
+              used + 1);
+      return false;
+    }
+    used += new_used;
+  }
+  *utf8_str = uch;
+  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
+  return true;  // Successfully read a box.
+}
+
+// Builds one box file line in box_str: the unichar string followed by the
+// box's left, bottom, right and top coordinates and the page number, each
+// preceded by a single space.
+void MakeBoxFileStr(const char* unichar_str, const TBOX& box, int page_num,
+                    STRING* box_str) {
+  // The numeric fields, in the order they appear on the line.
+  const int fields[] = {box.left(), box.bottom(), box.right(), box.top(),
+                        page_num};
+  const int num_fields = sizeof(fields) / sizeof(fields[0]);
+  *box_str = unichar_str;
+  for (int f = 0; f < num_fields; ++f) {
+    box_str->add_str_int(" ", fields[f]);
+  }
+}
+
--- a/ccstruct/boxread.h
+++ b/ccstruct/boxread.h
@ -0,0 +1,60 @@
+/**********************************************************************
+ * File:        boxread.h
+ * Description: Read data from a box file.
+ * Author:		Ray Smith
+ * Created:		Fri Aug 24 17:47:23 PDT 2007
+ *
+ * (C) Copyright 2007, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCUTIL_BOXREAD_H__
+#define TESSERACT_CCUTIL_BOXREAD_H__
+
+#include <stdio.h>
+#include "strngs.h"
+
+class STRING;
+class TBOX;
+
+// Size of buffer used to read a line from a box file.
+const int kBoxReadBufSize = 1024;
+
+// Open the boxfile based on the given image filename.
+FILE* OpenBoxFile(const STRING& fname);
+
+// ReadNextBox factors out the code to interpret a line of a box
+// file so that applybox and unicharset_extractor interpret the same way.
+// This function returns the next valid box file utf8 string and coords
+// and returns true, or false on eof (and closes the file).
+// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
+// for valid utf-8 and allows space or tab between fields.
+// utf8_str is set with the unichar string, and bounding box with the box.
+// If there are page numbers in the file, it reads them all.
+bool ReadNextBox(int *line_number, FILE* box_file,
+                 STRING* utf8_str, TBOX* bounding_box);
+// As ReadNextBox above, but get a specific page number. (0-based)
+// Use -1 to read any page number. Files without page number all
+// read as if they are page 0.
+bool ReadNextBox(int target_page, int *line_number, FILE* box_file,
+                 STRING* utf8_str, TBOX* bounding_box);
+
+// Parses the given box file string into a page_number, utf8_str, and
+// bounding_box. Returns true on a successful parse.
+bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
+                     STRING* utf8_str, TBOX* bounding_box);
+
+// Creates a box file string from a unichar string, TBOX and page number.
+void MakeBoxFileStr(const char* unichar_str, const TBOX& box, int page_num,
+                    STRING* box_str);
+
+#endif  // TESSERACT_CCUTIL_BOXREAD_H__
--- a/ccstruct/boxword.cpp
+++ b/ccstruct/boxword.cpp
@ -209,6 +209,13 @@ void BoxWord::DeleteBox(int index) {
  ComputeBoundingBox();
 }

+// Empties the BoxWord: clears the stored boxes, sets the length to zero and
+// resets the word's bounding box to a default-constructed TBOX.
+void BoxWord::DeleteAllBoxes() {
+  boxes_.clear();
+  length_ = 0;
+  bbox_ = TBOX();
+}
+
 // Computes the bounding box of the word.
 void BoxWord::ComputeBoundingBox() {
  bbox_ = TBOX();
--- a/ccstruct/boxword.h
+++ b/ccstruct/boxword.h
@ -85,6 +85,9 @@ class BoxWord {
  // Recomputes the bounding box.
  void DeleteBox(int index);

+  // Deletes all the boxes stored in BoxWord.
+  void DeleteAllBoxes();
+
  // This and other putatively are the same, so call the (permanent) callback
  // for each blob index where the bounding boxes match.
  // The callback is deleted on completion.
--- a/ccstruct/coutln.cpp
+++ b/ccstruct/coutln.cpp
@ -623,7 +623,7 @@ void C_OUTLINE::RemoveSmallRecursive(int min_size, C_OUTLINE_IT* it) {

 // Renders the outline to the given pix, with left and top being
 // the coords of the upper-left corner of the pix.
-void C_OUTLINE::render(int left, int top, Pix* pix) {
+void C_OUTLINE::render(int left, int top, Pix* pix) const {
  ICOORD pos = start;
  for (int stepindex = 0; stepindex < stepcount; ++stepindex) {
    ICOORD next_step = step(stepindex);
@ -638,6 +638,25 @@ void C_OUTLINE::render(int left, int top, Pix* pix) {
  }
 }

+// Draws only the boundary of the outline (no fill) into pix. left and top
+// are the coords of the upper-left corner of the pix, so an outline position
+// (x, y) maps to pix position (x - left, top - y). One pixel is set to 1 for
+// each step; which neighbour of the mapped position is set depends on the
+// direction of the step.
+void C_OUTLINE::render_outline(int left, int top, Pix* pix) const {
+  ICOORD cursor = start;
+  for (int i = 0; i < stepcount; ++i) {
+    ICOORD delta = step(i);
+    // Offset, in pix coordinates, of the pixel to set for this direction.
+    int x_adjust = 0;
+    int y_adjust = 0;
+    bool has_direction = true;
+    if (delta.y() < 0) {
+      // No adjustment needed.
+    } else if (delta.y() > 0) {
+      x_adjust = -1;
+      y_adjust = -1;
+    } else if (delta.x() < 0) {
+      x_adjust = -1;
+    } else if (delta.x() > 0) {
+      y_adjust = -1;
+    } else {
+      has_direction = false;  // Zero-length step: nothing to draw.
+    }
+    if (has_direction) {
+      pixSetPixel(pix, cursor.x() - left + x_adjust,
+                  top - cursor.y() + y_adjust, 1);
+    }
+    cursor += delta;
+  }
+}
+
 /**********************************************************************
 * C_OUTLINE::plot
 *
--- a/ccstruct/coutln.h
+++ b/ccstruct/coutln.h
@ -152,7 +152,11 @@ class DLLSYM C_OUTLINE:public ELIST_LINK

    // Renders the outline to the given pix, with left and top being
    // the coords of the upper-left corner of the pix.
-    void render(int left, int top, Pix* pix);
+    void render(int left, int top, Pix* pix) const;
+
+    // Renders just the outline to the given pix (no fill), with left and top
+    // being the coords of the upper-left corner of the pix.
+    void render_outline(int left, int top, Pix* pix) const;

    void plot(                       //draw one
              ScrollView* window,         //window to draw in
--- a/ccstruct/fontinfo.cpp
+++ b/ccstruct/fontinfo.cpp
@ -0,0 +1,162 @@
+///////////////////////////////////////////////////////////////////////
+// File:        fontinfo.cpp
+// Description: Font information classes abstracted from intproto.h/cpp.
+// Author:      rays@google.com (Ray Smith)
+// Created:     Wed May 18 10:39:01 PDT 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "fontinfo.h"
+
+namespace tesseract {
+
+// Returns true if the two FontInfo structures describe the same font.
+// Only the names are compared: two fonts with the same name are required to
+// have the same properties, so the name alone identifies an entry and is
+// enough to look up its properties in the table.
+bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2) {
+  const int name_order = strcmp(fi1.name, fi2.name);
+  return name_order == 0;
+}
+// Returns true if the two FontSet structures have the same size and hold
+// the same config ids in the same order.
+bool CompareFontSet(const FontSet& fs1, const FontSet& fs2) {
+  if (fs1.size != fs2.size) return false;
+  // Advance over the common prefix; equal sets run off the end.
+  int matched = 0;
+  while (matched < fs1.size && fs1.configs[matched] == fs2.configs[matched]) {
+    ++matched;
+  }
+  return matched >= fs1.size;
+}
+
+// Callbacks for GenericVector.
+// Frees the memory owned by a FontInfo: its name and, when present, the
+// spacing vector together with the FontSpacingInfo objects it points to.
+void FontInfoDeleteCallback(FontInfo f) {
+  delete[] f.name;
+  if (f.spacing_vec == NULL) return;
+  f.spacing_vec->delete_data_pointers();
+  delete f.spacing_vec;
+}
+// Frees the configs array of a FontSet (allocated with new[] in read_set).
+void FontSetDeleteCallback(FontSet fs) {
+  delete[] fs.configs;
+}
+
+/*---------------------------------------------------------------------------*/
+// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
+// Reads a FontInfo's name and properties from f, as written by write_info:
+// a 32 bit name length, the name bytes (no terminator) and the properties.
+// If swap is true the length and the properties are byte-reversed after
+// reading. Returns false on a read error or a negative name length.
+// fi->name is set to a newly allocated buffer as soon as the length is
+// known; the buffer is zero-filled, so it is a valid C string even if the
+// name itself could not be read completely.
+bool read_info(FILE* f, FontInfo* fi, bool swap) {
+  inT32 size;
+  if (fread(&size, sizeof(size), 1, f) != 1) return false;
+  if (swap)
+    Reverse32(&size);
+  // A negative length can only come from corrupt data; do not allocate.
+  if (size < 0) return false;
+  char* font_name = new char[size + 1]();  // () zero-fills, so terminated.
+  fi->name = font_name;
+  if (fread(font_name, sizeof(*font_name), size, f) !=
+      static_cast<size_t>(size)) return false;
+  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
+  if (swap)
+    Reverse32(&fi->properties);
+  return true;
+}
+
+// Writes a FontInfo's name and properties to f: a 32 bit name length, the
+// name bytes (without terminator) and then the properties.
+// Returns false as soon as any write fails.
+bool write_info(FILE* f, const FontInfo& fi) {
+  inT32 name_len = strlen(fi.name);
+  return fwrite(&name_len, sizeof(name_len), 1, f) == 1 &&
+         fwrite(fi.name, sizeof(*fi.name), name_len, f) == name_len &&
+         fwrite(&fi.properties, sizeof(fi.properties), 1, f) == 1;
+}
+
+// Reads the spacing information of a font from f, as written by
+// write_spacing_info: a 32 bit entry count followed, for each entry, by
+// x_gap_before, x_gap_after, a 32 bit kern count and (if that count is
+// positive) the serialized kerned_unichar_ids and kerned_x_gaps vectors.
+// A negative kern count marks a NULL entry, which is left NULL in
+// fi->spacing_vec. If swap is true the values are byte-reversed after
+// reading. An entry count of zero leaves fi untouched.
+// Returns false on a read error; entries read before the error stay in fi.
+bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
+  inT32 vec_size, kern_size;
+  if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
+  if (swap) Reverse32(&vec_size);
+  ASSERT_HOST(vec_size >= 0);
+  if (vec_size == 0) return true;
+  fi->init_spacing(vec_size);
+  for (int i = 0; i < vec_size; ++i) {
+    FontSpacingInfo *fs = new FontSpacingInfo();
+    if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
+        fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
+        fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
+      delete fs;  // Not yet owned by fi, so it must be freed here.
+      return false;
+    }
+    if (swap) {
+      ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
+      ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
+      Reverse32(&kern_size);
+    }
+    if (kern_size < 0) {  // indication of a NULL entry in fi->spacing_vec
+      delete fs;
+      continue;
+    }
+    if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
+                          !fs->kerned_x_gaps.DeSerialize(swap, f))) {
+      delete fs;  // Not yet owned by fi, so it must be freed here.
+      return false;
+    }
+    fi->add_spacing(i, fs);  // fi takes ownership of fs.
+  }
+  return true;
+}
+
+// Writes the spacing information of a font to f: a 32 bit entry count (0 if
+// there is no spacing vector) followed, for each entry, by x_gap_before,
+// x_gap_after, a 32 bit kern count and (if that count is positive) the
+// serialized kerned_unichar_ids and kerned_x_gaps vectors.
+// A NULL entry is written as gaps of -1 and a kern count of -1.
+// Returns false as soon as any write fails.
+bool write_spacing_info(FILE* f, const FontInfo& fi) {
+  inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
+  if (fwrite(&vec_size,  sizeof(vec_size), 1, f) != 1) return false;
+  const inT16 invalid_gap = -1;
+  for (int i = 0; i < vec_size; ++i) {
+    FontSpacingInfo *fs = fi.spacing_vec->get(i);
+    const bool missing = (fs == NULL);
+    inT32 kern_size = missing ? -1 : fs->kerned_x_gaps.size();
+    // Pick the values to write: the real gaps, or placeholders for NULL.
+    const inT16 *gap_before = missing ? &invalid_gap : &fs->x_gap_before;
+    const inT16 *gap_after = missing ? &invalid_gap : &fs->x_gap_after;
+    if (fwrite(gap_before, sizeof(*gap_before), 1, f) != 1 ||
+        fwrite(gap_after, sizeof(*gap_after), 1, f) != 1 ||
+        fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
+      return false;
+    }
+    if (kern_size <= 0)
+      continue;  // No kerning tables to write for this entry.
+    if (!fs->kerned_unichar_ids.Serialize(f) ||
+        !fs->kerned_x_gaps.Serialize(f)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Reads a FontSet from f, as written by write_set: the number of configs
+// followed by the configs themselves. If swap is true each value is
+// byte-reversed after reading. fs->configs is allocated with new[].
+// Returns false on a read error or a negative size; for a negative size the
+// set is left empty (size 0, configs NULL).
+bool read_set(FILE* f, FontSet* fs, bool swap) {
+  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
+  if (swap)
+    Reverse32(&fs->size);
+  if (fs->size < 0) {
+    // Corrupt count: never hand a negative length to new[].
+    fs->size = 0;
+    fs->configs = NULL;
+    return false;
+  }
+  fs->configs = new int[fs->size];
+  for (int i = 0; i < fs->size; ++i) {
+    if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
+    if (swap)
+      Reverse32(&fs->configs[i]);
+  }
+  return true;
+}
+
+// Writes a FontSet to f: the number of configs followed by each config in
+// turn. Returns false as soon as any write fails.
+bool write_set(FILE* f, const FontSet& fs) {
+  if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1) return false;
+  int written = 0;
+  while (written < fs.size) {
+    const int* config = &fs.configs[written];
+    if (fwrite(config, sizeof(*config), 1, f) != 1) return false;
+    ++written;
+  }
+  return true;
+}
+
+}  // namespace tesseract.
+
--- a/ccstruct/fontinfo.h
+++ b/ccstruct/fontinfo.h
@ -0,0 +1,133 @@
+///////////////////////////////////////////////////////////////////////
+// File:        fontinfo.h
+// Description: Font information classes abstracted from intproto.h/cpp.
+// Author:      rays@google.com (Ray Smith)
+// Created:     Tue May 17 17:08:01 PDT 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+
+#ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
+#define TESSERACT_CCSTRUCT_FONTINFO_H_
+
+#include "genericvector.h"
+#include "host.h"
+#include "unichar.h"
+
+namespace tesseract {
+
+// Struct for information about spacing between characters in a particular font.
+// One of these is stored per character (see FontInfo::spacing_vec).
+struct FontSpacingInfo {
+  // Default gaps: FontInfo::get_spacing adds the previous character's
+  // x_gap_after to this character's x_gap_before when no kerned entry
+  // applies.
+  inT16 x_gap_before;
+  inT16 x_gap_after;
+  // Parallel vectors: kerned_x_gaps[i] is the gap used by
+  // FontInfo::get_spacing when this character is followed by
+  // kerned_unichar_ids[i].
+  GenericVector<UNICHAR_ID> kerned_unichar_ids;
+  GenericVector<inT16> kerned_x_gaps;
+};
+
+/*
+ * Describes one font: its name, a bit field of style properties and,
+ * optionally, per-character horizontal spacing information.
+ * properties contains flags for italicness (value 1), boldness (2),
+ * fixed pitch (4), serif (8) and fraktur (16); see the is_*() accessors.
+ */
+struct FontInfo {
+  // Every field starts out as zero/NULL, so a default-constructed FontInfo
+  // never exposes indeterminate properties or ids.
+  FontInfo() : name(NULL), properties(0), universal_id(0), spacing_vec(NULL) {}
+  // Does not free name or spacing_vec (see FontInfoDeleteCallback).
+  ~FontInfo() {}
+  // Reserves unicharset_size spots in spacing_vec.
+  void init_spacing(int unicharset_size) {
+    spacing_vec = new GenericVector<FontSpacingInfo *>();
+    spacing_vec->init_to_size(unicharset_size, NULL);
+  }
+  // Adds the given pointer to FontSpacingInfo to spacing_vec member
+  // (FontInfo class takes ownership of the pointer).
+  // Note: init_spacing should be called before calling this function.
+  // uch_id must be a valid index into spacing_vec.
+  void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
+    ASSERT_HOST(spacing_vec != NULL && uch_id >= 0 &&
+                spacing_vec->size() > uch_id);
+    (*spacing_vec)[uch_id] = spacing_info;
+  }
+
+  // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID, or NULL
+  // if there is no spacing information or uch_id is out of range.
+  const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
+    if (spacing_vec == NULL || uch_id < 0 || spacing_vec->size() <= uch_id)
+      return NULL;
+    return (*spacing_vec)[uch_id];
+  }
+
+  // Fills spacing with the value of the x gap expected between the two given
+  // UNICHAR_IDs. Returns true on success, false if spacing information is
+  // missing for either character (spacing is then left untouched).
+  bool get_spacing(UNICHAR_ID prev_uch_id,
+                   UNICHAR_ID uch_id,
+                   int *spacing) const {
+    const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
+    const FontSpacingInfo *fsi = this->get_spacing(uch_id);
+    if (prev_fsi == NULL || fsi == NULL) return false;
+    // A kerned entry for this specific pair takes precedence.
+    int i = 0;
+    for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
+      if (prev_fsi->kerned_unichar_ids[i] == uch_id) break;
+    }
+    if (i < prev_fsi->kerned_unichar_ids.size()) {
+      *spacing = prev_fsi->kerned_x_gaps[i];
+    } else {
+      // Otherwise combine the default gaps of the two characters.
+      *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
+    }
+    return true;
+  }
+
+  bool is_italic() const { return (properties & 1) != 0; }
+  bool is_bold() const { return (properties & 2) != 0; }
+  bool is_fixed_pitch() const { return (properties & 4) != 0; }
+  bool is_serif() const { return (properties & 8) != 0; }
+  bool is_fraktur() const { return (properties & 16) != 0; }
+
+  // Font name (a C string).
+  char* name;
+  // Bit field of style flags, decoded by the is_*() accessors above.
+  uinT32 properties;
+  // The universal_id is a field reserved for the initialization process
+  // to assign a unique id number to all fonts loaded for the current
+  // combination of languages. This id will then be returned by
+  // ResultIterator::WordFontAttributes.
+  inT32 universal_id;
+  // Horizontal spacing between characters (indexed by UNICHAR_ID).
+  GenericVector<FontSpacingInfo *> *spacing_vec;
+};
+
+// Every class (character) owns a FontSet that represents all the fonts that can
+// render this character.
+// Since almost all the characters from the same script share the same set of
+// fonts, the sets are shared over multiple classes (see
+// Classify::fontset_table_). Thus, a class only store an id to a set.
+// Because some fonts cannot render just one character of a set, there are a
+// lot of FontSet that differ only by one font. Rather than storing directly
+// the FontInfo in the FontSet structure, it's better to share FontInfos among
+// FontSets (Classify::fontinfo_table_).
+struct FontSet {
+  int           size;     // Number of entries in configs.
+  int*          configs;  // FontInfo ids
+};
+
+// Compare FontInfo structures.
+bool CompareFontInfo(const FontInfo& fi1, const FontInfo& fi2);
+// Compare FontSet structures.
+bool CompareFontSet(const FontSet& fs1, const FontSet& fs2);
+// Deletion callbacks for GenericVector.
+void FontInfoDeleteCallback(FontInfo f);
+void FontSetDeleteCallback(FontSet fs);
+
+// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
+bool read_info(FILE* f, FontInfo* fi, bool swap);
+bool write_info(FILE* f, const FontInfo& fi);
+bool read_spacing_info(FILE *f, FontInfo* fi, bool swap);
+bool write_spacing_info(FILE* f, const FontInfo& fi);
+bool read_set(FILE* f, FontSet* fs, bool swap);
+bool write_set(FILE* f, const FontSet& fs);
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_CCSTRUCT_FONTINFO_H_
--- a/ccstruct/matrix.cpp
+++ b/ccstruct/matrix.cpp
@ -33,7 +33,7 @@
 #include "unicharset.h"

 // Print the best guesses out of the match rating matrix.
-void MATRIX::print(const UNICHARSET &unicharset) {
+void MATRIX::print(const UNICHARSET &unicharset) const {
  tprintf("Ratings Matrix (top choices)\n");
  int row, col;
  for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col);
--- a/ccstruct/matrix.h
+++ b/ccstruct/matrix.h
@ -101,9 +101,11 @@ class GENERIC_2D_ARRAY {
  int dim1() const { return dim1_; }
  int dim2() const { return dim2_; }

-  // Expression to select a specific location in the matrix.
+  // Expression to select a specific location in the matrix. The matrix is
+  // stored COLUMN-major, so the left-most index is the most significant.
+  // This allows [][] access to use indices in the same order as (,).
  int index(int column, int row) const {
-    return (row * dim1_ + column);
+    return (column * dim2_ + row);
  }

  // Put a list element into the matrix at a specific location.
@ -122,6 +124,11 @@ class GENERIC_2D_ARRAY {
  T& operator()(int column, int row) {
    return array_[this->index(column, row)];
  }
+  // Allow access using array[column][row]. NOTE that the indices are
+  // in the same left-to-right order as the () indexing.
+  // Returns the address of element (column, 0). index() places the elements
+  // of one column at consecutive positions (column * dim2_ + row), so
+  // applying a second [row] to the result selects element (column, row).
+  // No bounds checking is done here.
+  T* operator[](int column) {
+    return &array_[this->index(column, 0)];
+  }

  // Delete objects pointed to by array_[i].
  void delete_matrix_pointers() {
@ -188,7 +195,7 @@ class MATRIX : public GENERIC_MATRIX<BLOB_CHOICE_LIST *> {
  MATRIX(int dimension) : GENERIC_MATRIX<BLOB_CHOICE_LIST *>(dimension,
                                                             NOT_CLASSIFIED) {}
  // Print a shortened version of the contents of the matrix.
-  void print(const UNICHARSET &unicharset);
+  void print(const UNICHARSET &unicharset) const;
 };

 struct MATRIX_COORD {
--- a/ccstruct/normalis.cpp
+++ b/ccstruct/normalis.cpp
@ -24,7 +24,9 @@

 #include "allheaders.h"
 #include "blobs.h"
+#include "helpers.h"
 #include "ocrblock.h"
+#include "unicharset.h"
 #include "werd.h"


@ -254,7 +256,9 @@ void DENORM::LocalNormBlob(TBLOB* blob) const {
  blob->Move(translation);
  // Note that the old way of scaling only allowed for a single
  // scale factor.
-  blob->Scale(YScaleAtOrigX(x_center));
+  float scale = YScaleAtOrigX(x_center);
+  if (scale != 1.0f)
+    blob->Scale(scale);
  if (rotation_ != NULL)
    blob->Rotate(*rotation_);
  translation.set_x(IntCastRounded(final_xshift_));
@ -262,6 +266,54 @@ void DENORM::LocalNormBlob(TBLOB* blob) const {
  blob->Move(translation);
 }

+// Fills in the x-height range [*min_xht, *max_xht] accepted by the given
+// unichar_id, given its bounding box in the usual baseline-normalized
+// coordinates, with some initial crude x-height estimate (such as word size)
+// and this denoting the transformation that was used. Returns false and an
+// empty [0, 0] range if the bottom is a mis-fit. Returns true and an empty
+// [0, 0] range if the bottom fits, but the top is impossible.
+bool DENORM::XHeightRange(int unichar_id, const UNICHARSET& unicharset,
+                          const TBOX& bbox,
+                          inT16* min_xht, inT16* max_xht) const {
+  // Clip the top and bottom to the limit of normalized feature space.
+  int top = ClipToRange<int>(bbox.top(), 0, kBlnCellHeight - 1);
+  int bottom = ClipToRange<int>(bbox.bottom(), 0, kBlnCellHeight - 1);
+  // A tolerance of yscale corresponds to 1 pixel in the image.
+  double tolerance = y_scale();
+  int min_bottom, max_bottom, min_top, max_top;
+  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
+                            &min_top, &max_top);
+  // Default returns indicate a mis-fit.
+  *min_xht = 0;
+  *max_xht = 0;
+  // Chars with a misfitting bottom might be sub/superscript/dropcap, or might
+  // just be wrongly classified. Return an empty range so they have to be
+  // good to be considered.
+  if (bottom < min_bottom - tolerance || bottom > max_bottom + tolerance) {
+    return false;
+  }
+  // To help very high cap/xheight ratio fonts accept the correct x-height,
+  // and to allow the large caps in small caps to accept the xheight of the
+  // small caps, add kBlnBaselineOffset to chars whose max_top is the maximum.
+  if (max_top == kBlnCellHeight - 1)
+    max_top += kBlnBaselineOffset;
+  int height = top - kBlnBaselineOffset;  // Clipped top above the offset.
+  double min_height = min_top - kBlnBaselineOffset - tolerance;
+  double max_height = max_top - kBlnBaselineOffset + tolerance;
+  if (min_height <= 0.0) {
+    if (height <= 0 || max_height > 0)
+      *max_xht = MAX_INT16;  // Anything will do.
+  } else if (height > 0) {  // Upper x-height limit comes from min_height.
+    int result = IntCastRounded(height * kBlnXHeight / y_scale() / min_height);
+    *max_xht = static_cast<inT16>(ClipToRange(result, 0, MAX_INT16));
+  }
+  if (max_height > 0.0 && height > 0) {  // Lower limit comes from max_height.
+    int result = IntCastRounded(height * kBlnXHeight / y_scale() / max_height);
+    *min_xht = static_cast<inT16>(ClipToRange(result, 0, MAX_INT16));
+  }
+  return true;
+}
+
 // ============== Private Code ======================

 // Free allocated memory and clear pointers.
--- a/ccstruct/normalis.h
+++ b/ccstruct/normalis.h
@ -31,9 +31,21 @@ struct Pix;
 class ROW;                          // Forward decl
 class BLOCK;
 class FCOORD;
-struct TBLOB;
+class TBLOB;
 class TBOX;
-struct TPOINT;
+class TPOINT;
+class UNICHARSET;
+
+namespace tesseract {
+// Possible normalization methods. NEGATIVE values are used because these
+// also double as markers for the last sub-classifier.
+enum NormalizationMode {
+  NM_BASELINE = -3,         // The original BL normalization mode.
+  NM_CHAR_ISOTROPIC = -2,   // Character normalization but isotropic.
+  NM_CHAR_ANISOTROPIC = -1  // The original CN normalization mode.
+};
+
+}  // namespace tesseract.

 class DENORM_SEG {
 public:
@ -219,6 +231,15 @@ class DENORM {
  // more accurately copies the old way.
  void LocalNormBlob(TBLOB* blob) const;

+  // Fills in the x-height range accepted by the given unichar_id, given its
+  // bounding box in the usual baseline-normalized coordinates, with some
+  // initial crude x-height estimate (such as word size) and this denoting the
+  // transformation that was used. Returns false, and an empty range if the
+  // bottom is a mis-fit. Returns true and empty [0, 0] range if the bottom
+  // fits, but the top is impossible.
+  bool XHeightRange(int unichar_id, const UNICHARSET& unicharset,
+                    const TBOX& bbox, inT16* min_xht, inT16* max_xht) const;
+
  Pix* pix() const {
    return pix_;
  }
@ -236,6 +257,9 @@ class DENORM {
      return predecessor_->RootDenorm();
    return this;
  }
+  const DENORM* predecessor() const {
+    return predecessor_;
+  }
  // Accessors - perhaps should not be needed.
  float x_scale() const {
    return x_scale_;
--- a/ccstruct/ocrblock.cpp
+++ b/ccstruct/ocrblock.cpp
@ -18,10 +18,11 @@
 **********************************************************************/

 #include "mfcpch.h"
-#include          <stdlib.h>
-#include          "blckerr.h"
-#include          "ocrblock.h"
-#include          "tprintf.h"
+#include <stdlib.h>
+#include "blckerr.h"
+#include "ocrblock.h"
+#include "stepblob.h"
+#include "tprintf.h"

 #define BLOCK_LABEL_HEIGHT  150  //char height of block id

@ -86,6 +87,17 @@ void BLOCK::rotate(const FCOORD& rotation) {
  box = *poly_block()->bounding_box();
 }

+/**
+ * BLOCK::reflect_polygon_in_y_axis
+ *
+ * Reflects the polygon in the y-axis and recomputes the bounding_box.
+ * Does nothing to any contained rows/words/blobs etc.
+ */
+void BLOCK::reflect_polygon_in_y_axis() {
+  poly_block()->reflect_in_y_axis();  // NOTE(review): no NULL check here.
+  box = *poly_block()->bounding_box();  // Copy the polygon's new bounds.
+}
+
 /**
 * BLOCK::sort_rows
 *
@ -219,6 +231,166 @@ const BLOCK & source             //from this
  return *this;
 }

+// This function is for finding the approximate (horizontal) distance from
+// the x-coordinate of the left edge of a symbol to the left edge of the
+// text block which contains it.  We are passed:
+//   segments - output of PB_LINE_IT::get_line() which contains x-coordinate
+//       intervals for the scan line going through the symbol's y-coordinate.
+//       Each element of segments is of the form (x()=start_x, y()=length).
+//   x - the x coordinate of the symbol we're interested in.
+//   margin - output: the smallest distance from x to a segment start that is
+//       at or to the left of x.
+// If segments is empty or all start to the right of x, returns false and 0.
+bool LeftMargin(ICOORDELT_LIST *segments, int x, int *margin) {
+  bool found = false;
+  *margin = 0;
+  if (segments->empty())
+    return found;
+  ICOORDELT_IT seg_it(segments);
+  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
+    int cur_margin = x - seg_it.data()->x();  // Negative if start is right of x.
+    if (cur_margin >= 0) {
+      if (!found) {  // First candidate.
+        *margin = cur_margin;
+      } else if (cur_margin < *margin) {  // Keep the nearest start.
+        *margin = cur_margin;
+      }
+      found = true;
+    }
+  }
+  return found;
+}
+
+// This function is for finding the approximate (horizontal) distance from
+// the x-coordinate of the right edge of a symbol to the right edge of the
+// text block which contains it.  We are passed:
+//   segments - output of PB_LINE_IT::get_line() which contains x-coordinate
+//       intervals for the scan line going through the symbol's y-coordinate.
+//       Each element of segments is of the form (x()=start_x, y()=length).
+//   x - the x coordinate of the symbol we're interested in.
+//   margin - output: the smallest distance from x to a segment end that is
+//       at or to the right of x.
+// If segments is empty or all end to the left of x, returns false and 0.
+bool RightMargin(ICOORDELT_LIST *segments, int x, int *margin) {
+  bool found = false;
+  *margin = 0;
+  if (segments->empty())
+    return found;
+  ICOORDELT_IT seg_it(segments);
+  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
+    int cur_margin = seg_it.data()->x() + seg_it.data()->y() - x;  // end - x.
+    if (cur_margin >= 0) {
+      if (!found) {  // First candidate.
+        *margin = cur_margin;
+      } else if (cur_margin < *margin) {  // Keep the nearest end.
+        *margin = cur_margin;
+      }
+      found = true;
+    }
+  }
+  return found;
+}
+
+// Compute the distance from the left and right ends of each row to the
+// left and right edges of the block's polyblock.  Illustration:
+//  ____________________________   _______________________
+//  |  Howdy neighbor!         |  |rectangular blocks look|
+//  |  This text is  written to|  |more like stacked pizza|
+//  |illustrate how useful poly-  |boxes.                 |
+//  |blobs  are   in -----------  ------   The    polyblob|
+//  |dealing    with|     _________     |for a BLOCK  rec-|
+//  |harder   layout|   /===========\   |ords the possibly|
+//  |issues.        |    |  _    _  |   |skewed    pseudo-|
+//  |  You  see this|    | |_| \|_| |   |rectangular      |
+//  |text is  flowed|    |      }   |   |boundary     that|
+//  |around  a  mid-|     \   ____  |   |forms the  ideal-|
+//  |column portrait._____ \       /  __|ized  text margin|
+//  |  Polyblobs     exist| \    /   |from which we should|
+//  |to account for insets|  |   |   |measure    paragraph|
+//  |which make  otherwise|  -----   |indentation.        |
+//  -----------------------          ----------------------
+//
+// If we identify a drop-cap, we measure the left margin for the lines
+// below the first line relative to one space past the drop cap.  The
+// first line's margin and those past the drop cap area are measured
+// relative to the enclosing polyblock.
+//
+// TODO(rays): Before this will work well, we'll need to adjust the
+//             polyblob tighter around the text near images, as in:
+//             UNLV_AUTO:mag.3G0  page 2
+//             UNLV_AUTO:mag.3G4  page 16
+void BLOCK::compute_row_margins() {
+  if (row_list()->empty() || row_list()->singleton()) {  // Needs 2+ rows.
+    return;
+  }
+
+  // If layout analysis was not called (no poly_block), use the bounding box.
+  POLY_BLOCK rect_block(bounding_box(), PT_FLOWING_TEXT);
+  POLY_BLOCK *pblock = &rect_block;
+  if (poly_block() != NULL) {
+    pblock = poly_block();
+  }
+
+  // Step One: Determine if there is a drop-cap.
+  //           TODO(eger): Fix up drop cap code for RTL languages.
+  ROW_IT r_it(row_list());
+  ROW *first_row = r_it.data();
+  ROW *second_row = r_it.data_relative(1);
+
+  // Start with a fictitious drop-cap bottom one row height above row one.
+  int drop_cap_bottom = first_row->bounding_box().top() +
+                        first_row->bounding_box().height();
+  int drop_cap_right = first_row->bounding_box().left();
+  int mid_second_line = second_row->bounding_box().top() -
+                        second_row->bounding_box().height() / 2;
+  WERD_IT werd_it(r_it.data()->word_list());  // words of line one
+  if (!werd_it.empty()) {
+    C_BLOB_IT cblob_it(werd_it.data()->cblob_list());  // First word only.
+    for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list();
+         cblob_it.forward()) {
+      TBOX bbox = cblob_it.data()->bounding_box();
+      if (bbox.bottom() <= mid_second_line) {
+        // A blob reaching down to mid second line is taken as a drop cap.
+        first_row->set_has_drop_cap(true);
+        if (drop_cap_bottom >  bbox.bottom())
+          drop_cap_bottom = bbox.bottom();
+        if (drop_cap_right < bbox.right())
+          drop_cap_right = bbox.right();
+      }
+    }
+  }
+
+  // Step Two: Calculate the margin from the text of each row to the block
+  //           (or drop-cap) boundaries.
+  PB_LINE_IT lines(pblock);
+  r_it.set_to_list(row_list());
+  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+    ROW *row = r_it.data();
+    TBOX row_box = row->bounding_box();
+    int left_y = row->base_line(row_box.left()) + row->x_height();
+    int left_margin;
+    ICOORDELT_LIST *segments = lines.get_line(left_y);
+    LeftMargin(segments, row_box.left(), &left_margin);  // 0 if not found.
+    delete segments;
+
+    if (row_box.top() >= drop_cap_bottom) {  // Row top is above cap bottom.
+      int drop_cap_distance = row_box.left() - row->space() - drop_cap_right;
+      if (drop_cap_distance < 0)
+        drop_cap_distance = 0;
+      if (drop_cap_distance < left_margin)
+        left_margin = drop_cap_distance;
+    }
+
+    int right_y = row->base_line(row_box.right()) + row->x_height();
+    int right_margin;
+    segments = lines.get_line(right_y);
+    RightMargin(segments, row_box.right(), &right_margin);  // 0 if not found.
+    delete segments;
+    row->set_lmargin(left_margin);  // NOTE(review): narrows int to inT16.
+    row->set_rmargin(right_margin);  // NOTE(review): narrows int to inT16.
+  }
+}
+
 /**********************************************************************
 * PrintSegmentationStats
 *
--- a/ccstruct/ocrblock.h
+++ b/ccstruct/ocrblock.h
@ -21,6 +21,7 @@
 #define           OCRBLOCK_H

 #include          "img.h"
+#include          "ocrpara.h"
 #include          "ocrrow.h"
 #include          "pdblock.h"

@ -120,6 +121,14 @@ class BLOCK:public ELIST_LINK, public PDBLK
  ROW_LIST *row_list() {
    return &rows;
  }
+  // Compute the margins between the edges of each row and this block's
+  // polyblock, and store the results in the rows.
+  void compute_row_margins();
+
+  // get paragraphs
+  PARA_LIST *para_list() {
+    return &paras_;
+  }
  /// get blobs
  C_BLOB_LIST *blob_list() {
    return &c_blobs;
@ -157,6 +166,10 @@ class BLOCK:public ELIST_LINK, public PDBLK
    return PDBLK::render_mask(re_rotation_);
  }

+  // Reflects the polygon in the y-axis and recomputes the bounding_box.
+  // Does nothing to any contained rows/words/blobs etc.
+  void reflect_polygon_in_y_axis();
+
  void rotate(const FCOORD& rotation);

  /// decreasing y order
@ -187,6 +200,7 @@ class BLOCK:public ELIST_LINK, public PDBLK
  float cell_over_xheight_;    //< Ratio of cell height to xheight.
  STRING filename;             //< name of block
  ROW_LIST rows;               //< rows in block
+  PARA_LIST paras_;            //< paragraphs of block
  C_BLOB_LIST c_blobs;         //< before textord
  C_BLOB_LIST rej_blobs;       //< duff stuff
  FCOORD re_rotation_;         //< How to transform coords back to image.
--- a/ccstruct/ocrpara.cpp
+++ b/ccstruct/ocrpara.cpp
@ -0,0 +1,100 @@
+/////////////////////////////////////////////////////////////////////
+// File:        ocrpara.cpp
+// Description: OCR Paragraph Output Type
+// Author:      David Eger
+// Created:     2010-11-15
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+
+#include "ocrpara.h"
+#include "host.h"  // For NearlyEqual()
+
+ELISTIZE(PARA)
+
+using tesseract::JUSTIFICATION_LEFT;
+using tesseract::JUSTIFICATION_RIGHT;
+using tesseract::JUSTIFICATION_CENTER;
+using tesseract::JUSTIFICATION_UNKNOWN;
+
+static STRING ParagraphJustificationToString(
+    tesseract::ParagraphJustification justification) {
+  switch (justification) {  // Map the enum to an upper-case display name.
+    case JUSTIFICATION_LEFT:
+      return "LEFT";
+    case JUSTIFICATION_RIGHT:
+      return "RIGHT";
+    case JUSTIFICATION_CENTER:
+      return "CENTER";
+    default:  // JUSTIFICATION_UNKNOWN and any other value.
+      return "UNKNOWN";
+  }
+}
+
+bool ParagraphModel::ValidFirstLine(int lmargin, int lindent,
+                                    int rindent, int rmargin) const {
+  switch (justification_) {
+    case JUSTIFICATION_LEFT:  // Left whitespace must match a first line.
+      return NearlyEqual(lmargin + lindent, margin_ + first_indent_,
+                         tolerance_);
+    case JUSTIFICATION_RIGHT:  // Right whitespace must match a first line.
+      return NearlyEqual(rmargin + rindent, margin_ + first_indent_,
+                         tolerance_);
+    case JUSTIFICATION_CENTER:  // Indents must balance, at double tolerance.
+      return NearlyEqual(lindent, rindent, tolerance_ * 2);
+    default:
+      // shouldn't happen
+      return false;
+  }
+}
+
+bool ParagraphModel::ValidBodyLine(int lmargin, int lindent,
+                                   int rindent, int rmargin) const {
+  switch (justification_) {
+    case JUSTIFICATION_LEFT:  // Left whitespace must match a body line.
+      return NearlyEqual(lmargin + lindent, margin_ + body_indent_,
+                         tolerance_);
+    case JUSTIFICATION_RIGHT:  // Right whitespace must match a body line.
+      return NearlyEqual(rmargin + rindent, margin_ + body_indent_,
+                         tolerance_);
+    case JUSTIFICATION_CENTER:  // Indents must balance, at double tolerance.
+      return NearlyEqual(lindent, rindent, tolerance_ * 2);
+    default:
+      // shouldn't happen
+      return false;
+  }
+}
+
+bool ParagraphModel::Comparable(const ParagraphModel &other) const {
+  if (justification_ != other.justification_)
+    return false;
+  if (justification_ == JUSTIFICATION_CENTER ||
+      justification_ == JUSTIFICATION_UNKNOWN)
+    return true;  // Indents are not compared for these justifications.
+  int tolerance = (tolerance_ + other.tolerance_) / 4;  // Half the mean.
+  return NearlyEqual(margin_ + first_indent_,
+                     other.margin_ + other.first_indent_, tolerance) &&
+         NearlyEqual(margin_ + body_indent_,
+                     other.margin_ + other.body_indent_, tolerance);
+}
+
+STRING ParagraphModel::ToString() const {
+  char buffer[200];  // snprintf below is bounded by sizeof(buffer).
+  const STRING &alignment = ParagraphJustificationToString(justification_);
+  snprintf(buffer, sizeof(buffer),
+           "margin: %d, first_indent: %d, body_indent: %d, alignment: %s",
+           margin_, first_indent_, body_indent_, alignment.string());
+  return STRING(buffer);
+}
--- a/ccstruct/ocrpara.h
+++ b/ccstruct/ocrpara.h
@ -0,0 +1,191 @@
+/////////////////////////////////////////////////////////////////////
+// File:        ocrpara.h
+// Description: OCR Paragraph Output Type
+// Author:      David Eger
+// Created:     2010-11-15
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
+#define TESSERACT_CCSTRUCT_OCRPARA_H_
+
+#include "publictypes.h"
+#include "elst.h"
+#include "strngs.h"
+
+class ParagraphModel;
+
+struct PARA : public ELIST_LINK {
+ public:
+  PARA() : model(NULL), is_list_item(false),
+           is_very_first_or_continuation(false), has_drop_cap(false) {}
+
+  // We do not own the model, we just reference it.
+  // model may be NULL if there is not a good model for this paragraph.
+  const ParagraphModel *model;
+
+  bool is_list_item;  // NOTE(review): presumably marks list items; confirm.
+
+  // The first paragraph on a page often lacks a first line indent, but should
+  // still be modeled by the same model as other body text paragraphs on the
+  // page.
+  bool is_very_first_or_continuation;
+
+  // Does this paragraph begin with a drop cap?
+  bool has_drop_cap;
+};
+
+ELISTIZEH(PARA)
+
+// A geometric model of paragraph indentation and alignment.
+//
+// Measurements are in pixels. The meaning of the integer arguments changes
+// depending upon the value of justification.  Distances less than or equal
+// to tolerance apart we take as "equivalent" for the purpose of model
+// matching, and in the examples below, we assume tolerance is zero.
+//
+// justification = LEFT:
+//   margin       the "ignored" margin to the left block edge.
+//   first_indent indent from the left margin to a typical first text line.
+//   body_indent  indent from the left margin of a typical body text line.
+//
+// justification = RIGHT:
+//   margin       the "ignored" margin to the right block edge.
+//   first_indent indent from the right margin to a typical first text line.
+//   body_indent  indent from the right margin of a typical body text line.
+//
+// justification = CENTER:
+//   margin       ignored
+//   first_indent ignored
+//   body_indent  ignored
+//
+//  ====== Extended example, assuming each letter is ten pixels wide: =======
+//
+// +--------------------------------+
+// |      Awesome                   | ParagraphModel(CENTER, 0, 0, 0)
+// |   Centered Title               |
+// | Paragraph Detection            |
+// |      OCR TEAM                  |
+// |  10 November 2010              |
+// |                                |
+// |  Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
+// |This paragraph starts at the top|
+// |of the page and takes 3 lines.  |
+// |  Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
+// |which indicates that the first  |
+// |paragraph is not a continuation |
+// |from a previous page, as it is  |
+// |indented just like this second  |
+// |paragraph.                      |
+// |   Here is a block quote. It    | ParagraphModel(LEFT, 30, 0, 0)
+// |   looks like the prior text    |
+// |   but it  is indented  more    |
+// |   and is fully justified.      |
+// |  So how does one deal with     | ParagraphModel(LEFT, 0, 20, 0)
+// |centered text, block quotes,    |
+// |normal paragraphs, and lists    |
+// |like what follows?              |
+// |1. Make a plan.                 | ParagraphModel(LEFT, 0, 0, 30)
+// |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
+// |   looking for lines where the  |
+// |   first word of the next line  |
+// |   would fit on the previous    |
+// |   line.                        |
+// |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
+// |   Python and try it out.       |
+// |4. Determine how to fix the     | ParagraphModel(LEFT, 0, 0, 30)
+// |   mistakes.                    |
+// |5. Repeat.                      | ParagraphModel(LEFT, 0, 0, 30)
+// |  For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
+// |you can try to identify source  |
+// |code.  Ouch!                    |
+// +--------------------------------+
+class ParagraphModel {
+ public:
+  ParagraphModel(tesseract::ParagraphJustification justification,
+                 int margin,
+                 int first_indent,
+                 int body_indent,
+                 int tolerance)
+      : justification_(justification),
+        margin_(margin),
+        first_indent_(first_indent),
+        body_indent_(body_indent),
+        tolerance_(tolerance) {
+    // Shift the smaller of {first_indent, body_indent} into margin_ so it is 0.
+    int added_margin = first_indent;
+    if (body_indent < added_margin)
+      added_margin = body_indent;
+    margin_ += added_margin;
+    first_indent_ -= added_margin;
+    body_indent_ -= added_margin;
+  }
+
+  ParagraphModel()
+      : justification_(tesseract::JUSTIFICATION_UNKNOWN),
+         margin_(0),
+         first_indent_(0),
+         body_indent_(0),
+         tolerance_(0) { }
+
+  // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
+  // in a block of text which we are trying to model:
+  //   lmargin, lindent:  these add up to the distance from the leftmost ink
+  //                      in the text line to the surrounding text block's left
+  //                      edge.
+  //   rmargin, rindent:  these add up to the distance from the rightmost ink
+  //                      in the text line to the surrounding text block's right
+  //                      edge.
+  // The caller determines the division between "margin" and "indent", which
+  // only actually affects whether we think the line may be centered.
+  //
+  // If the amount of whitespace matches the amount of whitespace expected on
+  // the relevant side of the line (within tolerance_) we say it matches.
+
+  // Return whether a given text line could be a first paragraph line according
+  // to this paragraph model.
+  bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
+
+  // Return whether a given text line could be a body paragraph line according
+  // to this paragraph model.
+  bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
+
+  tesseract::ParagraphJustification justification() const {
+    return justification_;
+  }
+  int margin() const { return margin_; }
+  int first_indent() const { return first_indent_; }
+  int body_indent() const { return body_indent_; }
+  int tolerance() const { return tolerance_; }
+  bool is_flush() const {  // NOTE(review): abs() needs <stdlib.h> in scope.
+    return (justification_ == tesseract::JUSTIFICATION_LEFT ||
+            justification_ == tesseract::JUSTIFICATION_RIGHT) &&
+        abs(first_indent_ - body_indent_) <= tolerance_;
+  }
+
+  // Return whether this model is likely to agree with the other model on most
+  // paragraphs they mark.
+  bool Comparable(const ParagraphModel &other) const;
+
+  STRING ToString() const;
+
+ private:
+  tesseract::ParagraphJustification justification_;
+  int margin_;
+  int first_indent_;
+  int body_indent_;
+  int tolerance_;
+};
+
+#endif  // TESSERACT_CCSTRUCT_OCRPARA_H_
--- a/ccstruct/ocrrow.cpp
+++ b/ccstruct/ocrrow.cpp
@ -42,13 +42,18 @@ float ascenders,                 //ascender size
 float descenders,                //descender drop
 inT16 kern,                      //char gap
 inT16 space                      //word gap
-):
-baseline(spline_size, xstarts, coeffs) {
+)
+    : baseline(spline_size, xstarts, coeffs),
+      para_(NULL) {
  kerning = kern;                //just store stuff
  spacing = space;
  xheight = x_height;
  ascrise = ascenders;
+  bodysize = 0.0f;
  descdrop = descenders;
+  has_drop_cap_ = false;
+  lmargin_ = 0;
+  rmargin_ = 0;
 }


@ -63,13 +68,17 @@ ROW::ROW(                 //constructor
         TO_ROW *to_row,  //source row
         inT16 kern,      //char gap
         inT16 space      //word gap
-        ) {
+        ) : para_(NULL) {
  kerning = kern;                //just store stuff
  spacing = space;
  xheight = to_row->xheight;
+  bodysize = to_row->body_size;
  ascrise = to_row->ascrise;
  descdrop = to_row->descdrop;
  baseline = to_row->baseline;
+  has_drop_cap_ = false;
+  lmargin_ = 0;
+  rmargin_ = 0;
 }


@ -148,12 +157,14 @@ void ROW::move(                  // reposition row
 void ROW::print(          //print
                FILE *fp  //file to print on
               ) {
-  tprintf ("Kerning= %d\n", kerning);
-  tprintf ("Spacing= %d\n", spacing);
-  bound_box.print ();
-  tprintf ("Xheight= %f\n", xheight);
-  tprintf ("Ascrise= %f\n", ascrise);
-  tprintf ("Descdrop= %f\n", descdrop);
+  tprintf("Kerning= %d\n", kerning);
+  tprintf("Spacing= %d\n", spacing);
+  bound_box.print();
+  tprintf("Xheight= %f\n", xheight);
+  tprintf("Ascrise= %f\n", ascrise);
+  tprintf("Descdrop= %f\n", descdrop);
+  tprintf("has_drop_cap= %d\n", has_drop_cap_);
+  tprintf("lmargin= %d, rmargin= %d\n", lmargin_, rmargin_);
 }


@ -204,18 +215,21 @@ void ROW::plot(               //draw it
 * Assign rows by duplicating the row structure but NOT the WERDLIST
 **********************************************************************/

-ROW & ROW::operator= (           //assignment
-const ROW & source               //from this
-) {
+ROW & ROW::operator= (const ROW & source) {
  this->ELIST_LINK::operator= (source);
  kerning = source.kerning;
  spacing = source.spacing;
  xheight = source.xheight;
+  bodysize = source.bodysize;
  ascrise = source.ascrise;
  descdrop = source.descdrop;
  if (!words.empty ())
    words.clear ();
  baseline = source.baseline;    //QSPLINES must do =
  bound_box = source.bound_box;
+  has_drop_cap_ = source.has_drop_cap_;
+  lmargin_ = source.lmargin_;
+  rmargin_ = source.rmargin_;
+  para_ = source.para_;
  return *this;
 }
--- a/ccstruct/ocrrow.h
+++ b/ccstruct/ocrrow.h
@ -1,8 +1,8 @@
 /**********************************************************************
 * File:        ocrrow.h  (Formerly row.h)
 * Description: Code for the ROW class.
- * Author:		Ray Smith
- * Created:		Tue Oct 08 15:58:04 BST 1991
+ * Author:      Ray Smith
+ * Created:     Tue Oct 08 15:58:04 BST 1991
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
@ -20,12 +20,15 @@
 #ifndef           OCRROW_H
 #define           OCRROW_H

-#include          <stdio.h>
-#include          "quspline.h"
-#include          "werd.h"
+#include <stdio.h>
+
+#include "quspline.h"
+#include "werd.h"

 class TO_ROW;

+class PARA;
+
 class ROW:public ELIST_LINK
 {
  friend void tweak_row_baseline(ROW *, double, double);
@ -64,6 +67,12 @@ class ROW:public ELIST_LINK
    inT32 kern() const {  //return kerning
      return kerning;
    }
+    float body_size() const {  //return body size
+      return bodysize;
+    }
+    void set_body_size(float new_size) {  // set body size
+      bodysize = new_size;
+    }
    inT32 space() const {  //return spacing
      return spacing;
    }
@ -77,6 +86,33 @@ class ROW:public ELIST_LINK
      return bound_box;
    }

+    void set_lmargin(inT16 lmargin) {  // Distance to left polyblock margin.
+      lmargin_ = lmargin;
+    }
+    void set_rmargin(inT16 rmargin) {  // Distance to right polyblock margin.
+      rmargin_ = rmargin;
+    }
+    inT16 lmargin() const {
+      return lmargin_;
+    }
+    inT16 rmargin() const {
+      return rmargin_;
+    }
+
+    void set_has_drop_cap(bool has) {
+      has_drop_cap_ = has;
+    }
+    bool has_drop_cap() const {
+      return has_drop_cap_;
+    }
+
+    void set_para(PARA *p) {  // Paragraph of which this row is part.
+      para_ = p;
+    }
+    PARA *para() const {
+      return para_;
+    }
+
    void recalc_bounding_box();  //recalculate BB

    void move(                    // reposition row
@ -104,12 +140,22 @@ class ROW:public ELIST_LINK
  private:
    inT32 kerning;               //inter char gap
    inT32 spacing;               //inter word gap
-    TBOX bound_box;               //bounding box
+    TBOX bound_box;              //bounding box
    float xheight;               //height of line
    float ascrise;               //size of ascenders
    float descdrop;              //-size of descenders
+    float bodysize;              //CJK character size. (equals to
+                                 //xheight+ascrise by default)
    WERD_LIST words;             //words
    QSPLINE baseline;            //baseline spline
+
+    // These get set after blocks have been determined.
+    bool has_drop_cap_;
+    inT16 lmargin_;   // Distance to left polyblock margin.
+    inT16 rmargin_;   // Distance to right polyblock margin.
+
+    // This gets set during paragraph analysis.
+    PARA *para_;      // Paragraph of which this row is part.
 };

 ELISTIZEH (ROW)
--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@ -24,6 +24,63 @@
 #include          "pageres.h"
 #include          "blobs.h"

+const char kBlameCorrect[] = "corr";  // Short names used in blamer output.
+const char kBlameClassifier[] = "cl";
+const char kBlameChopper[] = "chop";
+const char kBlameClassLMTradeoff[] = "cl/LM";
+const char kBlamePageLayout[] = "pglt";
+const char kBlameSegsearchHeur[] = "ss_heur";
+const char kBlameSegsearchPP[] = "ss_pp";
+const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
+const char kBlameAdaption[] = "adapt";
+const char kBlameNoTruthSplit[] = "no_tr_spl";
+const char kBlameNoTruth[] = "no_tr";
+const char kBlameUnknown[] = "unkn";
+
+const char * const kIncorrectResultReasonNames[] = {  // Indexed by reason.
+    kBlameCorrect,
+    kBlameClassifier,
+    kBlameChopper,
+    kBlameClassLMTradeoff,
+    kBlamePageLayout,
+    kBlameSegsearchHeur,
+    kBlameSegsearchPP,
+    kBlameClassOldLMTradeoff,
+    kBlameAdaption,
+    kBlameNoTruthSplit,
+    kBlameNoTruth,
+    kBlameUnknown
+};
+
+const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
+  return kIncorrectResultReasonNames[irr];  // No bounds check on irr.
+}
+
+const char *BlamerBundle::IncorrectReason() const {
+  return kIncorrectResultReasonNames[incorrect_result_reason];  // Unchecked.
+}
+
+void BlamerBundle::FillDebugString(const STRING &msg,
+                                   const WERD_CHOICE *choice,
+                                   STRING *debug) {
+  (*debug) += "Truth ";  // Output is appended to *debug, not overwritten.
+  for (int i = 0; i < this->truth_text.length(); ++i) {
+    (*debug) += this->truth_text[i];
+  }
+  if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)";
+  if (choice != NULL) {  // choice is optional.
+    (*debug) += " Choice ";
+    STRING choice_str;
+    choice->string_and_lengths(&choice_str, NULL);
+    (*debug) += choice_str;
+  }
+  if (msg.length() > 0) {  // msg, if any, goes on its own line.
+    (*debug) += "\n";
+    (*debug) += msg;
+  }
+  (*debug) += "\n";
+}
+
 ELISTIZE (BLOCK_RES)
 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)
 /*************************************************************************
@ -34,22 +91,16 @@ CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)
 PAGE_RES::PAGE_RES(
    BLOCK_LIST *the_block_list,
    WERD_CHOICE **prev_word_best_choice_ptr) {
+  Init();
  BLOCK_IT block_it(the_block_list);
  BLOCK_RES_IT block_res_it(&block_res_list);
-
-  char_count = 0;
-  rej_count = 0;
-  rejected = FALSE;
-
  for (block_it.mark_cycle_pt();
       !block_it.cycled_list(); block_it.forward()) {
    block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
  }
-
  prev_word_best_choice = prev_word_best_choice_ptr;
 }

-
 /*************************************************************************
 * BLOCK_RES::BLOCK_RES
 *
@ -72,8 +123,7 @@ BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
  block = the_block;

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
-    row_res_it.add_to_end(new ROW_RES(the_block->right_to_left(),
-                                      row_it.data()));
+    row_res_it.add_to_end(new ROW_RES(row_it.data()));
  }
 }

@ -84,8 +134,7 @@ BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
 * Constructor for ROW results
 *************************************************************************/

-ROW_RES::ROW_RES(bool right_to_left,
-                 ROW *the_row) {
+ROW_RES::ROW_RES(ROW *the_row) {
  WERD_IT word_it(the_row->word_list());
  WERD_RES_IT word_res_it(&word_res_list);
  WERD_RES *combo = NULL;        // current combination of fuzzies
@ -97,13 +146,17 @@ ROW_RES::ROW_RES(bool right_to_left,
  whole_word_rej_count = 0;

  row = the_row;
-  if (right_to_left) {
-    word_it.move_to_last();
-    for (word_it.mark_cycle_pt(); !word_it.cycled_list();  word_it.backward()) {
-      word_res = new WERD_RES(word_it.data());
-      word_res->x_height = the_row->x_height();
-      // A FUZZY_NON marks the beginning of a combo if we are not in one.
-      if (combo == NULL && word_res->word->flag(W_FUZZY_NON)) {
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    word_res = new WERD_RES(word_it.data());
+    word_res->x_height = the_row->x_height();
+
+    if (word_res->word->flag(W_FUZZY_NON)) {
+      ASSERT_HOST(combo != NULL);
+      word_res->part_of_combo = TRUE;
+      combo->copy_on(word_res);
+    }
+    if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
+      if (combo == NULL) {
        copy_word = new WERD;
                                 //deep copy
        *copy_word = *(word_it.data());
@ -111,42 +164,12 @@ ROW_RES::ROW_RES(bool right_to_left,
        combo->x_height = the_row->x_height();
        combo->combination = TRUE;
        word_res_it.add_to_end(combo);
-        word_res->part_of_combo = TRUE;
-      } else if (combo != NULL) {
-        word_res->part_of_combo = TRUE;
-        combo->copy_on(word_res);
-        // The first non FUZZY_NON is the last word in the combo.
-        if (!word_res->word->flag(W_FUZZY_NON))
-          combo = NULL;
      }
-      word_res_it.add_to_end(word_res);
-    }
-  } else {
-    for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
-      word_res = new WERD_RES(word_it.data());
-      word_res->x_height = the_row->x_height();
-
-      if (word_res->word->flag(W_FUZZY_NON)) {
-        ASSERT_HOST(combo != NULL);
-        word_res->part_of_combo = TRUE;
-        combo->copy_on(word_res);
-      }
-      if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
-        if (combo == NULL) {
-          copy_word = new WERD;
-                                   //deep copy
-          *copy_word = *(word_it.data());
-          combo = new WERD_RES(copy_word);
-          combo->x_height = the_row->x_height();
-          combo->combination = TRUE;
-          word_res_it.add_to_end(combo);
-        }
-        word_res->part_of_combo = TRUE;
-      } else {
-        combo = NULL;
-      }
-      word_res_it.add_to_end(word_res);
+      word_res->part_of_combo = TRUE;
+    } else {
+      combo = NULL;
    }
+    word_res_it.add_to_end(word_res);
  }
 }

@ -174,10 +197,8 @@ WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
  correct_text = source.correct_text;

  if (source.best_choice != NULL) {
-    best_choice = new WERD_CHOICE;
-    *best_choice = *(source.best_choice);
-    raw_choice = new WERD_CHOICE;
-    *raw_choice = *(source.raw_choice);
+    best_choice = new WERD_CHOICE(*source.best_choice);
+    raw_choice = new WERD_CHOICE(*source.raw_choice);
    best_choice_fontinfo_ids = source.best_choice_fontinfo_ids;
  }
  else {
@ -187,16 +208,24 @@ WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
      best_choice_fontinfo_ids.clear();
    }
  }
-  if (source.ep_choice != NULL) {
-    ep_choice = new WERD_CHOICE;
-    *ep_choice = *(source.ep_choice);
+  for (int i = 0; i < source.alt_choices.length(); ++i) {
+    const WERD_CHOICE *choice = source.alt_choices[i];
+    ASSERT_HOST(choice != NULL);
+    alt_choices.push_back(new WERD_CHOICE(*choice));
  }
-  else
+  alt_states = source.alt_states;
+  if (source.ep_choice != NULL) {
+    ep_choice = new WERD_CHOICE(*source.ep_choice);
+  } else {
    ep_choice = NULL;
+  }
  reject_map = source.reject_map;
  combination = source.combination;
  part_of_combo = source.part_of_combo;
  CopySimpleFields(source);
+  if (source.blamer_bundle != NULL) {
+    blamer_bundle =  new BlamerBundle(*(source.blamer_bundle));
+  }
  return *this;
 }

@ -211,54 +240,222 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) {
  small_caps = source.small_caps;
  italic = source.italic;
  bold = source.bold;
-  fontinfo_id = source.fontinfo_id;
+  fontinfo = source.fontinfo;
+  fontinfo2 = source.fontinfo2;
  fontinfo_id_count = source.fontinfo_id_count;
-  fontinfo_id2 = source.fontinfo_id2;
  fontinfo_id2_count = source.fontinfo_id2_count;
  x_height = source.x_height;
  caps_height = source.caps_height;
  guessed_x_ht = source.guessed_x_ht;
  guessed_caps_ht = source.guessed_caps_ht;
  reject_spaces = source.reject_spaces;
+  uch_set = source.uch_set;
+  tesseract = source.tesseract;
+}
+
+// Initializes a blank (default constructed) WERD_RES from one that has
+// already been recognized.
+// Use SetupFor*Recognition afterwards to complete the setup and make
+// it ready for a retry recognition.
+void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
+  word = source.word;
+  CopySimpleFields(source);
+  if (source.blamer_bundle != NULL) {
+    blamer_bundle = new BlamerBundle();
+    blamer_bundle->CopyTruth(*source.blamer_bundle);
+  }
 }

 // Sets up the members used in recognition:
 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
 // Returns false if the word is empty and sets up fake results.
-bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset,
-                                   bool numeric_mode, ROW *row, BLOCK* block) {
-  ClearResults();
-  if (word->cblob_list()->empty()) {
+bool WERD_RES::SetupForTessRecognition(const UNICHARSET& unicharset_in,
+                                   tesseract::Tesseract* tess, Pix* pix,
+                                   bool numeric_mode,
+                                   bool use_body_size,
+                                   ROW *row, BLOCK* block) {
+  tesseract = tess;
+  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
+  if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) {
    // Empty words occur when all the blobs have been moved to the rej_blobs
    // list, which seems to occur frequently in junk.
-    chopped_word = new TWERD;
-    rebuild_word = new TWERD;
-    bln_boxes = new tesseract::BoxWord;
-    box_word = new tesseract::BoxWord;
-    best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
-                                        TOP_CHOICE_PERM, unicharset);
-    raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
-                                       TOP_CHOICE_PERM, unicharset);
-    tess_failed = true;
+    SetupFake(unicharset_in);
+    word->set_flag(W_REP_CHAR, false);
    return false;
  }
+  ClearResults();
+  SetupWordScript(unicharset_in);
  chopped_word = TWERD::PolygonalCopy(word);
-  chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm);
+  if (use_body_size && row->body_size() > 0.0f) {
+    chopped_word->SetupBLNormalize(block, row, row->body_size(),
+                                   numeric_mode, &denorm);
+  } else {
+    chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm);
+  }
+  // The image will be 8-bit grey if the input was grey or color. Note that in
+  // a grey image 0 is black and 255 is white. If the input was binary, then
+  // the pix will be binary and 0 is white, with 1 being black.
+  // To tell the difference pixGetDepth() will return 8 or 1.
+  denorm.set_pix(pix);
+  // The inverse flag will be true iff the word has been determined to be white
+  // on black, and is independent of whether the pix is 8 bit or 1 bit.
+  denorm.set_inverse(word->flag(W_INVERSE));
  chopped_word->Normalize(denorm);
  bln_boxes = tesseract::BoxWord::CopyFromNormalized(NULL, chopped_word);
  seam_array = start_seam_list(chopped_word->blobs);
-  best_choice = new WERD_CHOICE;
+  best_choice = new WERD_CHOICE(&unicharset_in);
  best_choice->make_bad();
-  raw_choice = new WERD_CHOICE;
+  raw_choice = new WERD_CHOICE(&unicharset_in);
  raw_choice->make_bad();
+  SetupBlamerBundle();
  return true;
 }

+// Sets up the members used in cube recognition: the word script and denorm
+// (plus the blamer_bundle's normalized truth boxes if it has a bundle).
+// Returns false if the word is in a non-text block and sets up fake results.
+bool WERD_RES::SetupForCubeRecognition(const UNICHARSET& unicharset_in,
+                                       tesseract::Tesseract* tess,
+                                       const BLOCK* block) {
+  tesseract = tess;
+  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
+  if (pb != NULL && !pb->IsText()) {
+    // Ignore words in graphic regions.
+    SetupFake(unicharset_in);
+    word->set_flag(W_REP_CHAR, false);
+    return false;
+  }
+  ClearResults();
+  SetupWordScript(unicharset_in);
+  TBOX word_box = word->bounding_box();
+  denorm.SetupNormalization(block, NULL, NULL, NULL, NULL, 0,
+                            word_box.left(), word_box.bottom(),
+                            1.0f, 1.0f, 0.0f, 0.0f);
+  SetupBlamerBundle();
+  return true;
+}
+
+// Sets up the members used in recognition for an empty recognition result:
+// chopped_word, rebuild_word, bln_boxes, box_word, best_choice, raw_choice.
+void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
+  ClearResults();
+  SetupWordScript(unicharset_in);
+  chopped_word = new TWERD;
+  rebuild_word = new TWERD;
+  bln_boxes = new tesseract::BoxWord;
+  box_word = new tesseract::BoxWord;
+  int blob_count = word->cblob_list()->length();
+  best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
+                                TOP_CHOICE_PERM, unicharset_in);
+  raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
+                               TOP_CHOICE_PERM, unicharset_in);
+  if (blob_count > 0) {
+    BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
+    // For non-text blocks, just pass any blobs through to the box_word
+    // and call the word failed with a fake classification.
+    C_BLOB_IT b_it(word->cblob_list());
+    int blob_id = 0;
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      TBOX box = b_it.data()->bounding_box();
+      box_word->InsertBox(box_word->length(), box);
+      fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
+                                                -1, -1, -1, 0, 0, false);
+    }
+    FakeClassifyWord(blob_count, fake_choices);
+    delete [] fake_choices;
+  }
+  tess_failed = true;
+}
+
+void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
+  uch_set = &uch;
+  int script = uch.default_sid();
+  word->set_script_id(script);
+  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
+  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
+}
+
+// Sets up the blamer_bundle if it is not null, using the initialized denorm.
+void WERD_RES::SetupBlamerBundle() {
+  if (blamer_bundle != NULL) {
+    blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale();
+    TPOINT topleft;
+    TPOINT botright;
+    TPOINT norm_topleft;
+    TPOINT norm_botright;
+    for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) {
+      const TBOX &box = blamer_bundle->truth_word.BlobBox(b);
+      topleft.x = box.left();
+      topleft.y = box.top();
+      botright.x = box.right();
+      botright.y = box.bottom();
+      denorm.NormTransform(topleft, &norm_topleft);
+      denorm.NormTransform(botright, &norm_botright);
+      TBOX norm_box(norm_topleft.x, norm_botright.y,
+                    norm_botright.x, norm_topleft.y);
+      blamer_bundle->norm_truth_word.InsertBox(b, norm_box);
+    }
+  }
+}
+
+// Simple helper moves the ownership of the pointer data from src to dest,
+// first deleting anything in dest, and nulling out src afterwards.
+template<class T> static void MovePointerData(T** dest, T**src) {
+  delete *dest;
+  *dest = *src;
+  *src = NULL;
+}
+
+// Moves the results fields from word to this. This takes ownership of all
+// the data, so word can be destructed.
+void WERD_RES::ConsumeWordResults(WERD_RES* word) {
+  denorm = word->denorm;
+  MovePointerData(&chopped_word, &word->chopped_word);
+  MovePointerData(&rebuild_word, &word->rebuild_word);
+  MovePointerData(&box_word, &word->box_word);
+  if (seam_array != NULL)
+    free_seam_list(seam_array);
+  seam_array = word->seam_array;
+  word->seam_array = NULL;
+  best_state.move(&word->best_state);
+  correct_text.move(&word->correct_text);
+  MovePointerData(&best_choice, &word->best_choice);
+  MovePointerData(&raw_choice, &word->raw_choice);
+  alt_choices.delete_data_pointers();
+  alt_choices.move(&word->alt_choices);
+  alt_states.move(&word->alt_states);
+  reject_map = word->reject_map;
+  if (word->blamer_bundle != NULL) {
+    assert(blamer_bundle != NULL);
+    blamer_bundle->CopyResults(*(word->blamer_bundle));
+  }
+  CopySimpleFields(*word);
+}
+
+// Replace the best choice and rebuild box word.
+void WERD_RES::ReplaceBestChoice(
+    const WERD_CHOICE& choice,
+    const GenericVector<int>& segmentation_state) {
+  delete best_choice;
+  best_choice = new WERD_CHOICE(choice);
+  best_state = segmentation_state;
+  RebuildBestState();
+  SetupBoxWord();
+  // Make up a fake reject map of the right length to keep the
+  // rejection pass happy.
+  reject_map.initialise(segmentation_state.length());
+  done = tess_accepted = tess_would_adapt = true;
+  SetScriptPositions();
+}
+
 // Builds the rebuild_word from the chopped_word and the best_state.
 void WERD_RES::RebuildBestState() {
  if (rebuild_word != NULL)
    delete rebuild_word;
  rebuild_word = new TWERD;
+  if (seam_array == NULL) {
+    seam_array = start_seam_list(chopped_word->blobs);
+  }
  TBLOB* prev_blob = NULL;
  int start = 0;
  for (int i = 0; i < best_state.size(); ++i) {
@ -305,18 +502,51 @@ void WERD_RES::SetupBoxWord() {

 // Sets up the script positions in the output boxword using the best_choice
 // to get the unichars, and the unicharset to get the target positions.
-void WERD_RES::SetScriptPositions(const UNICHARSET& unicharset) {
-  box_word->SetScriptPositions(unicharset, small_caps, rebuild_word,
+void WERD_RES::SetScriptPositions() {
+  box_word->SetScriptPositions(*uch_set, small_caps, rebuild_word,
                               best_choice);
 }

+void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const {
+  int end = best_choice->length();
+  while (end > 0 &&
+         uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) &&
+         box_word->BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
+    end--;
+  }
+  int start = 0;
+  while (start < end &&
+         uch_set->get_isdigit(best_choice->unichar_ids()[start]) &&
+         box_word->BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
+    start++;
+  }
+  *pstart = start;
+  *pend = end;
+}
+
+void WERD_RES::WithoutFootnoteSpan(
+    const WERD_CHOICE &word, const GenericVector<int> &state,
+    int *pstart, int *pend) const {
+  int len = word.length();
+  *pstart = 0;
+  *pend = len;
+  if (len < 2) return;
+  if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) &&
+      !word.unicharset()->get_isdigit(word.unichar_ids()[0])) return;
+
+  // ok, now that we know the word starts or ends with a digit, do the
+  // expensive bit of figuring out if they're superscript.
+  WERD_RES copy(*this);
+  copy.ReplaceBestChoice(word, state);
+  copy.WithoutFootnoteSpan(pstart, pend);
+}
+
 // Classifies the word with some already-calculated BLOB_CHOICEs.
 // The choices are an array of blob_count pointers to BLOB_CHOICE,
 // providing a single classifier result for each blob.
 // The BLOB_CHOICEs are consumed and the word takes ownership.
 // The number of blobs in the outword must match blob_count.
-void WERD_RES::FakeClassifyWord(const UNICHARSET& unicharset, int blob_count,
-                                BLOB_CHOICE** choices) {
+void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
  // Setup the WERD_RES.
  ASSERT_HOST(box_word != NULL);
  ASSERT_HOST(blob_count == box_word->length());
@ -333,19 +563,19 @@ void WERD_RES::FakeClassifyWord(const UNICHARSET& unicharset, int blob_count,
    bc_it.add_after_then_move(choice_list);
  }
  best_choice->set_blob_choices(word_choices);
-  best_choice->populate_unichars(unicharset);
+  best_choice->populate_unichars();
  delete raw_choice;
  raw_choice = new WERD_CHOICE(*best_choice);
  reject_map.initialise(blob_count);
 }

 // Copies the best_choice strings to the correct_text for adaption/training.
-void WERD_RES::BestChoiceToCorrectText(const UNICHARSET& unicharset) {
+void WERD_RES::BestChoiceToCorrectText() {
  correct_text.clear();
  ASSERT_HOST(best_choice != NULL);
  for (int i = 0; i < best_choice->length(); ++i) {
    UNICHAR_ID choice_id = best_choice->unichar_id(i);
-    const char* blob_choice = unicharset.id_to_unichar(choice_id);
+    const char* blob_choice = uch_set->id_to_unichar(choice_id);
    correct_text.push_back(STRING(blob_choice));
  }
 }
@ -356,7 +586,6 @@ void WERD_RES::BestChoiceToCorrectText(const UNICHARSET& unicharset) {
 // result to the class returned from class_cb.
 // Returns true if anything was merged.
 bool WERD_RES::ConditionalBlobMerge(
-    const UNICHARSET& unicharset,
    TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
    TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,

@ -405,20 +634,153 @@ bool WERD_RES::ConditionalBlobMerge(
  delete class_cb;
  delete box_cb;
  if (modified) {
-    best_choice->populate_unichars(unicharset);
-    raw_choice->populate_unichars(unicharset);
+    best_choice->populate_unichars();
+    raw_choice->populate_unichars();
  }
  return modified;
 }

+// TODO(tkielbus) Decide between keeping this behavior here or modifying the
+// training data.
+
+// Utility function for fix_quotes
+// Return true if the next character in the string (given the UTF8 length in
+// bytes) is a quote character.
+static int is_simple_quote(const char* signed_str, int length) {
+  const unsigned char* str =
+      reinterpret_cast<const unsigned char*>(signed_str);
+  // Standard 1 byte quotes.
+  return (length == 1 && (*str == '\'' || *str == '`')) ||
+      // UTF-8 3 bytes curved quotes.
+      (length == 3 && ((*str == 0xe2 &&
+                        *(str + 1) == 0x80 &&
+                        *(str + 2) == 0x98) ||
+                       (*str == 0xe2 &&
+                        *(str + 1) == 0x80 &&
+                        *(str + 2) == 0x99)));
+}
+
+// Callback helper for fix_quotes returns a double quote if both
+// arguments are quotes, otherwise INVALID_UNICHAR_ID.
+UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
+  const char *ch = uch_set->id_to_unichar(id1);
+  const char *next_ch = uch_set->id_to_unichar(id2);
+  if (is_simple_quote(ch, strlen(ch)) &&
+      is_simple_quote(next_ch, strlen(next_ch)))
+    return uch_set->unichar_to_id("\"");
+  return INVALID_UNICHAR_ID;
+}
+
+// Change pairs of quotes to double quotes.
+void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) {
+  if (!uch_set->contains_unichar("\"") ||
+      !uch_set->get_enabled(uch_set->unichar_to_id("\"")))
+    return;  // Don't create it if it is disallowed.
+
+  ConditionalBlobMerge(
+      NewPermanentTessCallback(this, &WERD_RES::BothQuotes),
+      NULL,
+      blob_choices);
+}
+
+// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
+// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
+UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
+  const char *ch = uch_set->id_to_unichar(id1);
+  const char *next_ch = uch_set->id_to_unichar(id2);
+  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
+      (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
+    return uch_set->unichar_to_id("-");
+  return INVALID_UNICHAR_ID;
+}
+
+// Callback helper for fix_hyphens returns true if box1 and box2 overlap
+// (assuming both on the same textline, are in order and a chopped em dash.)
+bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
+  return box1.right() >= box2.left();
+}
+
+// Change pairs of hyphens to a single hyphen if the bounding boxes touch.
+// Typically a long dash which has been segmented.
+void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) {
+  if (!uch_set->contains_unichar("-") ||
+      !uch_set->get_enabled(uch_set->unichar_to_id("-")))
+    return;  // Don't create it if it is disallowed.
+
+  ConditionalBlobMerge(
+      NewPermanentTessCallback(this, &WERD_RES::BothHyphens),
+      NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap),
+      blob_choices);
+}
+
+// Callback helper for merge_tess_fails returns a space if both
+// arguments are space, otherwise INVALID_UNICHAR_ID.
+UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
+  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
+    return id1;
+  else
+    return INVALID_UNICHAR_ID;
+}
+
+// Change pairs of tess failures to a single one
+void WERD_RES::merge_tess_fails() {
+  if (ConditionalBlobMerge(
+      NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL,
+      best_choice->blob_choices())) {
+    int len = best_choice->length();
+    ASSERT_HOST(reject_map.length() == len);
+    ASSERT_HOST(box_word->length() == len);
+  }
+}
+
+// Returns true if the collection of count pieces, starting at start, are all
+// natural connected components, ie there are no real chops involved.
+bool WERD_RES::PiecesAllNatural(int start, int count) const {
+  // all seams must have no splits.
+  for (int index = start; index < start + count - 1; ++index) {
+    if (index >= 0 && index < array_count(seam_array)) {
+      SEAM* seam = reinterpret_cast<SEAM *>(array_value(seam_array, index));
+      if (seam != NULL && seam->split1 != NULL)
+        return false;
+    }
+  }
+  return true;
+}
+

 WERD_RES::~WERD_RES () {
  Clear();
 }

+void WERD_RES::InitNonPointers() {
+  tess_failed = FALSE;
+  tess_accepted = FALSE;
+  tess_would_adapt = FALSE;
+  done = FALSE;
+  unlv_crunch_mode = CR_NONE;
+  small_caps = false;
+  italic = FALSE;
+  bold = FALSE;
+  // The fontinfos and tesseract count as non-pointers as they point to
+  // data owned elsewhere.
+  fontinfo = NULL;
+  fontinfo2 = NULL;
+  tesseract = NULL;
+  fontinfo_id_count = 0;
+  fontinfo_id2_count = 0;
+  x_height = 0.0;
+  caps_height = 0.0;
+  guessed_x_ht = TRUE;
+  guessed_caps_ht = TRUE;
+  combination = FALSE;
+  part_of_combo = FALSE;
+  reject_spaces = FALSE;
+}
+
 void WERD_RES::InitPointers() {
  word = NULL;
  bln_boxes = NULL;
+  uch_set = NULL;
  chopped_word = NULL;
  rebuild_word = NULL;
  box_word = NULL;
@ -426,17 +788,25 @@ void WERD_RES::InitPointers() {
  best_choice = NULL;
  raw_choice = NULL;
  ep_choice = NULL;
+  blamer_bundle = NULL;
 }

 void WERD_RES::Clear() {
-  if (word != NULL && combination)
+  if (word != NULL && combination) {
    delete word;
+  }
  word = NULL;
+  delete blamer_bundle;
+  blamer_bundle = NULL;
  ClearResults();
 }

 void WERD_RES::ClearResults() {
  done = false;
+  fontinfo = NULL;
+  fontinfo2 = NULL;
+  fontinfo_id_count = 0;
+  fontinfo_id2_count = 0;
  if (bln_boxes != NULL) {
    delete bln_boxes;
    bln_boxes = NULL;
@ -465,18 +835,93 @@ void WERD_RES::ClearResults() {
    best_choice = NULL;
    raw_choice = NULL;
  }
+  if (!alt_choices.empty()) {
+    alt_choices.delete_data_pointers();
+    alt_choices.clear();
+  }
+  alt_states.clear();
  if (ep_choice != NULL) {
    delete ep_choice;
    ep_choice = NULL;
  }
+  if (blamer_bundle != NULL) blamer_bundle->ClearResults();
 }

+bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
+  return word_res == other.word_res &&
+      row_res == other.row_res &&
+      block_res == other.block_res;
+}
+
+int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
+  ASSERT_HOST(page_res == other.page_res);
+  if (other.block_res == NULL) {
+    // other points to the end of the page.
+    if (block_res == NULL)
+      return 0;
+    return -1;
+  }
+  if (block_res == NULL) {
+    return 1; // we point to the end of the page.
+  }
+  if (block_res == other.block_res) {
+    if (other.row_res == NULL || row_res == NULL) {
+      // this should only happen if we hit an image block.
+      return 0;
+    }
+    if (row_res == other.row_res) {
+      // we point to the same block and row.
+      ASSERT_HOST(other.word_res != NULL && word_res != NULL);
+      if (word_res == other.word_res) {
+        // we point to the same word!
+        return 0;
+      }
+
+      WERD_RES_IT word_res_it(&row_res->word_res_list);
+      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
+           word_res_it.forward()) {
+        if (word_res_it.data() == word_res) {
+          return -1;
+        } else if (word_res_it.data() == other.word_res) {
+          return 1;
+        }
+      }
+      ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
+    }
+
+    // we both point to the same block, but different rows.
+    ROW_RES_IT row_res_it(&block_res->row_res_list);
+    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
+         row_res_it.forward()) {
+      if (row_res_it.data() == row_res) {
+        return -1;
+      } else if (row_res_it.data() == other.row_res) {
+        return 1;
+      }
+    }
+    ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
+  }
+
+  // We point to different blocks.
+  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
+  for (block_res_it.mark_cycle_pt();
+       !block_res_it.cycled_list(); block_res_it.forward()) {
+    if (block_res_it.data() == block_res) {
+      return -1;
+    } else if (block_res_it.data() == other.block_res) {
+      return 1;
+    }
+  }
+  // Shouldn't happen...
+  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
+  return 0;
+}

 // Inserts the new_word and a corresponding WERD_RES before the current
 // position. The simple fields of the WERD_RES are copied from clone_res and
 // the resulting WERD_RES is returned for further setup with best_choice etc.
-WERD_RES* PAGE_RES_IT::InsertCloneWord(const WERD_RES& clone_res,
-                                       WERD* new_word) {
+WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
+                                             WERD* new_word) {
  // Insert new_word into the ROW.
  WERD_IT w_it(row()->row->word_list());
  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
@ -652,6 +1097,34 @@ WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
  return word_res;
 }

+/*************************************************************************
+ * PAGE_RES_IT::restart_row()
+ *
+ * Move to the beginning (leftmost word) of the current row.
+ *************************************************************************/
+WERD_RES *PAGE_RES_IT::restart_row() {
+  ROW_RES *row = this->row();
+  if (!row) return NULL;
+  for (restart_page(); this->row() != row; forward()) {
+    // pass
+  }
+  return word();
+}
+
+/*************************************************************************
+ * PAGE_RES_IT::forward_paragraph
+ *
+ * Move to the beginning of the next paragraph, allowing empty blocks.
+ *************************************************************************/
+
+WERD_RES *PAGE_RES_IT::forward_paragraph() {
+  while (block_res == next_block_res &&
+         (next_row_res != NULL && next_row_res->row != NULL &&
+          row_res->row->para() == next_row_res->row->para())) {
+    internal_forward(false, true);
+  }
+  return internal_forward(false, true);
+}

 /*************************************************************************
 * PAGE_RES_IT::forward_block
@ -666,7 +1139,6 @@ WERD_RES *PAGE_RES_IT::forward_block() {
  return internal_forward(false, true);
 }

-
 void PAGE_RES_IT::rej_stat_word() {
  inT16 chars_in_word;
  inT16 rejects_in_word = 0;
--- a/ccstruct/pageres.h
+++ b/ccstruct/pageres.h
@ -26,11 +26,179 @@
 #include "normalis.h"
 #include "ocrblock.h"
 #include "ocrrow.h"
+#include "params_training_featdef.h"
 #include "ratngs.h"
 #include "rejctmap.h"
 #include "seam.h"
 #include "werd.h"

+namespace tesseract {
+struct FontInfo;
+class Tesseract;
+}
+using tesseract::FontInfo;
+
+static const inT16 kBlamerBoxTolerance = 5;
+
+// Enum for expressing the source of error.
+// Note: Please update kIncorrectResultReasonNames when modifying this enum.
+enum IncorrectResultReason {
+  // The text recorded in best choice == truth text
+  IRR_CORRECT,
+  // Either: Top choice is incorrect and is a dictionary word (language model
+  // is unlikely to help correct such errors, so blame the classifier).
+  // Or: the correct unichar was not included in shortlist produced by the
+  // classifier at all.
+  IRR_CLASSIFIER,
+  // Chopper has not found one or more splits that correspond to the correct
+  // character bounding boxes recorded in BlamerBundle::truth_word.
+  IRR_CHOPPER,
+  // Classifier did include correct unichars for each blob in the correct
+  // segmentation, however its rating could have been too bad to allow the
+  // language model to pull out the correct choice. On the other hand the
+  // strength of the language model might have been too weak to favor the
+  // correct answer; thus we call this case a classifier-language model
+  // tradeoff error.
+  IRR_CLASS_LM_TRADEOFF,
+  // Page layout failed to produce the correct bounding box. Blame page layout
+  // if the truth was not found for the word, which implies that the bounding
+  // box of the word was incorrect (no truth word had a similar bounding box).
+  IRR_PAGE_LAYOUT,
+  // SegSearch heuristic prevented one or more blobs from the correct
+  // segmentation state from being classified (e.g. the blob was too wide).
+  IRR_SEGSEARCH_HEUR,
+  // The correct segmentation state was not explored because of poor SegSearch
+  // pain point prioritization. We blame SegSearch pain point prioritization
+  // if the best rating of a choice constructed from correct segmentation is
+  // better than that of the best choice (i.e. if we got to explore the correct
+  // segmentation state, language model would have picked the correct choice).
+  IRR_SEGSEARCH_PP,
+  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
+  // and thus use the old language model (permuters).
+  // TODO(antonova): integrate the new language model with chopper
+  IRR_CLASS_OLD_LM_TRADEOFF,
+  // If there is an incorrect adaptive template match with a better score than
+  // a correct one (either pre-trained or adapted), mark this as adaption error.
+  IRR_ADAPTION,
+  // split_and_recog_word() failed to find a suitable split in truth.
+  IRR_NO_TRUTH_SPLIT,
+  // Truth is not available for this word (e.g. when words in corrected content
+  // file are turned into ~~~~ because an appropriate alignment was not found).
+  IRR_NO_TRUTH,
+  // The text recorded in best choice != truth text, but none of the above
+  // reasons are set.
+  IRR_UNKNOWN,
+
+  IRR_NUM_REASONS
+};
+
+// Blamer-related information to determine the source of errors.
+struct BlamerBundle {
+  static const char *IncorrectReasonName(IncorrectResultReason irr);
+  BlamerBundle() : truth_has_char_boxes(false),
+      incorrect_result_reason(IRR_CORRECT),
+      lattice_data(NULL) { ClearResults(); }
+  ~BlamerBundle() { delete[] lattice_data; }
+  void ClearResults() {
+    norm_truth_word.DeleteAllBoxes();
+    norm_box_tolerance = 0;
+    if (!NoTruth()) incorrect_result_reason = IRR_CORRECT;
+    debug = "";
+    segsearch_is_looking_for_blame = false;
+    best_correctly_segmented_rating = WERD_CHOICE::kBadRating;
+    correct_segmentation_cols.clear();
+    correct_segmentation_rows.clear();
+    best_choice_is_dict_and_top_choice = false;
+    delete[] lattice_data;
+    lattice_data = NULL;
+    lattice_size = 0;
+  }
+  void CopyTruth(const BlamerBundle &other) {
+    truth_has_char_boxes = other.truth_has_char_boxes;
+    truth_word = other.truth_word;
+    truth_text = other.truth_text;
+    incorrect_result_reason =
+        (other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT);
+  }
+  void CopyResults(const BlamerBundle &other) {
+    norm_truth_word = other.norm_truth_word;
+    norm_box_tolerance = other.norm_box_tolerance;
+    incorrect_result_reason = other.incorrect_result_reason;
+    segsearch_is_looking_for_blame = other.segsearch_is_looking_for_blame;
+    best_correctly_segmented_rating =other.best_correctly_segmented_rating;
+    correct_segmentation_cols = other.correct_segmentation_cols;
+    correct_segmentation_rows = other.correct_segmentation_rows;
+    best_choice_is_dict_and_top_choice =
+        other.best_choice_is_dict_and_top_choice;
+    if (other.lattice_data != NULL) {
+      lattice_data = new char[other.lattice_size];
+      memcpy(lattice_data, other.lattice_data, other.lattice_size);
+      lattice_size = other.lattice_size;
+    } else {
+      lattice_data = NULL;
+    }
+  }
+  BlamerBundle(const BlamerBundle &other) {
+    this->CopyTruth(other);
+    this->CopyResults(other);
+  }
+  const char *IncorrectReason() const;
+  bool NoTruth() const {
+    return (incorrect_result_reason == IRR_NO_TRUTH ||
+             incorrect_result_reason == IRR_PAGE_LAYOUT);
+  }
+  void SetBlame(IncorrectResultReason irr,
+                const STRING &msg, const WERD_CHOICE *choice, bool debug) {
+    this->incorrect_result_reason = irr;
+    this->debug = this->IncorrectReason();
+    this->debug += " to blame: ";
+    this->FillDebugString(msg, choice, &(this->debug));
+    if (debug) tprintf("SetBlame(): %s", this->debug.string());
+  }
+  // Appends choice and truth details to the given debug string.
+  void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
+                       STRING *debug);
+
+  // Set to true when bounding boxes for individual unichars are recorded.
+  bool truth_has_char_boxes;
+  // The truth_word (in the original image coordinate space) contains ground
+  // truth bounding boxes for this WERD_RES.
+  tesseract::BoxWord truth_word;
+  // Same as above, but in normalized coordinates
+  // (filled in by WERD_RES::SetupForRecognition()).
+  tesseract::BoxWord norm_truth_word;
+  // Tolerance for bounding box comparisons in normalized space.
+  int norm_box_tolerance;
+  // Contains ground truth unichar for each of the bounding boxes in truth_word.
+  GenericVector<STRING> truth_text;
+  // The reason for incorrect OCR result.
+  IncorrectResultReason incorrect_result_reason;
+  // Debug text associated with the blame.
+  STRING debug;
+  // Misadaption debug information (filled in if this word was misadapted to).
+  STRING misadaption_debug;
+  // Variables used by the segmentation search when looking for the blame.
+  // Set to true while segmentation search is continued after the usual
+  // termination condition in order to look for the blame.
+  bool segsearch_is_looking_for_blame;
+  // Best rating for correctly segmented path
+  // (set and used by SegSearch when looking for blame).
+  float best_correctly_segmented_rating;
+  // Vectors populated by SegSearch to indicate column and row indices that
+  // correspond to blobs with correct bounding boxes.
+  GenericVector<int> correct_segmentation_cols;
+  GenericVector<int> correct_segmentation_rows;
+  // Set to true if best choice is a dictionary word and
+  // classifier's top choice.
+  bool best_choice_is_dict_and_top_choice;
+  // Serialized segmentation search lattice.
+  char *lattice_data;
+  int lattice_size;  // size of lattice_data in bytes
+  // Information about hypotheses (paths) explored by the segmentation search.
+  tesseract::ParamsTrainingBundle params_training_bundle;
+};
+
 /* Forward declarations */

 class BLOCK_RES;
@ -56,9 +224,23 @@ class PAGE_RES {                 // page result
  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
  // the next word. This pointer is not owned by PAGE_RES class.
  WERD_CHOICE **prev_word_best_choice;
+  // Sums of blame reasons computed by the blamer.
+  GenericVector<int> blame_reasons;
+  // Debug information about all the misadaptions on this page.
+  // Each BlamerBundle contains an index into this vector, so that words that
+  // caused misadaption could be marked. However, since words could be
+  // deleted/split/merged, the log is stored on the PAGE_RES level.
+  GenericVector<STRING> misadaption_log;

-  PAGE_RES() {
-  }                            // empty constructor
+  inline void Init() {
+    char_count = 0;
+    rej_count = 0;
+    rejected = FALSE;
+    prev_word_best_choice = NULL;
+    blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
+  }
+
+  PAGE_RES() { Init(); }  // empty constructor

  PAGE_RES(BLOCK_LIST *block_list,   // real blocks
           WERD_CHOICE **prev_word_best_choice_ptr);
@ -110,7 +292,7 @@ class ROW_RES:public ELIST_LINK {
  ROW_RES() {
  }                            // empty constructor

-  ROW_RES(bool right_to_left, ROW *the_row);  // real row
+  ROW_RES(ROW *the_row);  // real row

  ~ROW_RES() {                // destructor
  }
@ -142,29 +324,61 @@ class WERD_RES : public ELIST_LINK {
  // In any case a rotation by denorm.block()->re_rotation() will take them
  // back to the original image.
  // The other differences between words all represent different stages of
-  // processing.
-  //
+  // processing during recognition.
+
+  // ---------------------------INPUT-------------------------------------
+
  // The word is the input C_BLOBs in the rotated pixel space.
  // word is NOT owned by the WERD_RES unless combination is true.
  // All the other word pointers ARE owned by the WERD_RES.
  WERD* word;                     // Input C_BLOB word.
+
+  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
+
  // The bln_boxes contains the bounding boxes (only) of the input word, in the
  // BLN space. The lengths of word and bln_boxes
  // match as they are both before any chopping.
  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
  // if it doesn't.
  tesseract::BoxWord* bln_boxes;  // BLN input bounding boxes.
+  // The denorm provides the transformation to get back to the rotated image
+  // coords from the chopped_word/rebuild_word BLN coords.
+  DENORM denorm;                  // For use on chopped_word.
+  // Unicharset used by the classifier output in best_choice and raw_choice.
+  const UNICHARSET* uch_set;  // For converting back to utf8.
+
+  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
+  // ----Setup to a (different!) state expected by the various classifiers----
+  // TODO(rays) Tidy and make more consistent.
+
  // The chopped_word is also in BLN space, and represents the fully chopped
  // character fragments that make up the word.
  // The length of chopped_word matches length of seam_array + 1 (if set).
  TWERD* chopped_word;            // BLN chopped fragments output.
  SEAMS seam_array;               // Seams matching chopped_word.
+  WERD_CHOICE *best_choice;       // tess output
+  WERD_CHOICE *raw_choice;        // top choice permuter
+  // Alternative paths found during chopping/segmentation search stages
+  // (the first entry being a slim copy of best_choice).
+  GenericVector<WERD_CHOICE *> alt_choices;
+  GenericVector<GenericVector<int> > alt_states;
+
+  // Truth bounding boxes, text and incorrect choice reason.
+  BlamerBundle *blamer_bundle;
+
+  // --------------OUTPUT FROM RECOGNITION-------------------------------
+  // --------------Not all fields are necessarily set.-------------------
+  // ---best_choice, raw_choice *must* end up set, with a box_word-------
+  // ---In complete output, the number of blobs in rebuild_word matches---
+  // ---the number of boxes in box_word, the number of unichar_ids in---
+  // ---best_choice, the number of ints in best_state, and the number---
+  // ---of strings in correct_text--------------------------------------
+  // ---SetupFake Sets everything to appropriate values if the word is---
+  // ---known to be bad before recognition.------------------------------
+
  // The rebuild_word is also in BLN space, but represents the final best
  // segmentation of the word. Its length is therefore the same as box_word.
  TWERD* rebuild_word;            // BLN best segmented word.
-  // The denorm provides the transformation to get back to the rotated image
-  // coords from the chopped_word/rebuild_word BLN coords.
-  DENORM denorm;                  // For use on chopped_word.
  // The box_word is in the original image coordinate space. It is the
  // bounding boxes of the rebuild_word, after denormalization.
  // The length of box_word matches rebuild_word, best_state (if set) and
@ -180,16 +394,16 @@ class WERD_RES : public ELIST_LINK {
  // text to the training system without the need for a unicharset. There
  // is one entry in the vector for each blob in rebuild_word and box_word.
  GenericVector<STRING> correct_text;
-  // The truth_* fields below are used by the blamer to determine the source
-  // of errors.
-  // The truth_word (in the original image coordinate space) contains ground
-  // truth bounding boxes for this WERD_RES.
-  tesseract::BoxWord* truth_word;
-  // The truth_text contains ground truth unichar for each
-  // of the bounding boxes in truth_word.
-  GenericVector<STRING> truth_text;
-  WERD_CHOICE *best_choice;    // tess output
-  WERD_CHOICE *raw_choice;     // top choice permuter
+  // The Tesseract that was used to recognize this word. Just a borrowed
+  // pointer. Note: Tesseract's class definition is in a higher-level library.
+  // We avoid introducing a cyclic dependency by not using the Tesseract
+  // within WERD_RES. We are just storing it to provide access to it
+  // for the top-level multi-language controller, and maybe for output of
+  // the recognized language.
+  tesseract::Tesseract* tesseract;
+
+  // Less-well documented members.
+  // TODO(rays) Add more documentation here.
  WERD_CHOICE *ep_choice;      // ep text TODO(rays) delete this.
  REJMAP reject_map;           // best_choice rejects
  BOOL8 tess_failed;
@ -206,15 +420,17 @@ class WERD_RES : public ELIST_LINK {
  bool small_caps;             // word appears to be small caps
  inT8 italic;
  inT8 bold;
-  inT16 fontinfo_id;            // primary font id (should be at least inT16)
+  // The fontinfos are pointers to data owned by the classifier.
+  const FontInfo* fontinfo;
+  const FontInfo* fontinfo2;
  inT8 fontinfo_id_count;       // number of votes
-  inT16 fontinfo_id2;           // secondary font id (should be at least inT16)
  inT8 fontinfo_id2_count;      // number of votes
+  BOOL8 guessed_x_ht;
+  BOOL8 guessed_caps_ht;
  CRUNCH_MODE unlv_crunch_mode;
  float x_height;              // post match estimate
  float caps_height;           // post match estimate
-  BOOL8 guessed_x_ht;
-  BOOL8 guessed_caps_ht;
+
  /*
    To deal with fuzzy spaces we need to be able to combine "words" to form
    combinations when we suspect that the gap is a non-space. The (new) text
@ -238,31 +454,13 @@ class WERD_RES : public ELIST_LINK {
  GenericVector<inT8> best_choice_fontinfo_ids;

  WERD_RES() {
+    InitNonPointers();
    InitPointers();
  }
-  WERD_RES(                   //simple constructor
-           WERD *the_word) {  //real word
+  WERD_RES(WERD *the_word) {
+    InitNonPointers();
    InitPointers();
    word = the_word;
-    tess_failed = FALSE;
-    tess_accepted = FALSE;
-    tess_would_adapt = FALSE;
-    done = FALSE;
-    unlv_crunch_mode = CR_NONE;
-    small_caps = false;
-    italic = FALSE;
-    bold = FALSE;
-    fontinfo_id = -1;
-    fontinfo_id_count = 0;
-    fontinfo_id2 = -1;
-    fontinfo_id2_count = 0;
-    x_height = 0.0;
-    caps_height = 0.0;
-    guessed_x_ht = TRUE;
-    guessed_caps_ht = TRUE;
-    combination = FALSE;
-    part_of_combo = FALSE;
-    reject_spaces = FALSE;
  }
  WERD_RES(const WERD_RES &source) {
    InitPointers();
@ -270,6 +468,80 @@ class WERD_RES : public ELIST_LINK {
  }

  ~WERD_RES();
+
+  // Returns the UTF-8 string for the given blob index in the best_choice word,
+  // given that we know whether we are in a right-to-left reading context.
+  // This matters for mirrorable characters such as parentheses.  We recognize
+  // characters purely based on their shape on the page, and by default produce
+  // the corresponding unicode for a left-to-right context.
+  const char* const BestUTF8(int blob_index, bool in_rtl_context) const {
+    if (blob_index < 0 || blob_index >= best_choice->length())
+      return NULL;
+    UNICHAR_ID id = best_choice->unichar_id(blob_index);
+    if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
+      return NULL;
+    UNICHAR_ID mirrored = uch_set->get_mirror(id);
+    if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID)
+      id = mirrored;
+    return uch_set->id_to_unichar_ext(id);
+  }
+  // Returns the UTF-8 string for the given blob index in the raw_choice word.
+  const char* const RawUTF8(int blob_index) const {
+    if (blob_index < 0 || blob_index >= raw_choice->length())
+      return NULL;
+    UNICHAR_ID id = raw_choice->unichar_id(blob_index);
+    if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
+      return NULL;
+    return uch_set->id_to_unichar(id);
+  }
+
+  UNICHARSET::Direction SymbolDirection(int blob_index) const {
+    if (best_choice == NULL ||
+        blob_index >= best_choice->length() ||
+        blob_index < 0)
+      return UNICHARSET::U_OTHER_NEUTRAL;
+    return uch_set->get_direction(best_choice->unichar_id(blob_index));
+  }
+
+  bool AnyRtlCharsInWord() const {
+    if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
+      return false;
+    for (int id = 0; id < best_choice->length(); id++) {
+      int unichar_id = best_choice->unichar_id(id);
+      if (unichar_id < 0 || unichar_id >= uch_set->size())
+        continue;  // Ignore illegal chars.
+      UNICHARSET::Direction dir =
+          uch_set->get_direction(unichar_id);
+      if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
+          dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
+          dir == UNICHARSET::U_ARABIC_NUMBER)
+        return true;
+    }
+    return false;
+  }
+
+  bool AnyLtrCharsInWord() const {
+    if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
+      return false;
+    for (int id = 0; id < best_choice->length(); id++) {
+      int unichar_id = best_choice->unichar_id(id);
+      if (unichar_id < 0 || unichar_id >= uch_set->size())
+        continue;  // Ignore illegal chars.
+      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
+      if (dir == UNICHARSET::U_LEFT_TO_RIGHT)
+        return true;
+    }
+    return false;
+  }
+
+  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
+  // that gave us the unichars in reading order (as opposed to strict left
+  // to right).
+  bool UnicharsInReadingOrder() const {
+    return best_choice->unichars_in_script_order();
+  }
+
+  void InitNonPointers();
  void InitPointers();
  void Clear();
  void ClearResults();
@ -278,11 +550,55 @@ class WERD_RES : public ELIST_LINK {

  void CopySimpleFields(const WERD_RES& source);

+  // Initializes a blank (default constructed) WERD_RES from one that has
+  // already been recognized.
+  // Use SetupFor*Recognition afterwards to complete the setup and make
+  // it ready for a retry recognition.
+  void InitForRetryRecognition(const WERD_RES& source);
+
+  // Sets up the members used in recognition: bln_boxes, chopped_word,
+  // seam_array, denorm, best_choice, raw_choice.  Returns false if
+  // the word is empty and sets up fake results.  If use_body_size is
+  // true and row->body_size is set, then body_size will be used for
+  // blob normalization instead of xheight + ascrise. This flag is for
+  // those languages that are using CJK pitch model and thus it has to
+  // be true if and only if tesseract->textord_use_cjk_fp_model is
+  // true.
+  bool SetupForTessRecognition(const UNICHARSET& unicharset_in,
+                               tesseract::Tesseract* tesseract, Pix* pix,
+                               bool numeric_mode, bool use_body_size,
+                               ROW *row, BLOCK* block);
+
  // Sets up the members used in recognition:
-  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
+  // bln_boxes, chopped_word, seam_array, denorm.
  // Returns false if the word is empty and sets up fake results.
-  bool SetupForRecognition(const UNICHARSET& unicharset,
-                           bool numeric_mode, ROW *row, BLOCK* block);
+  bool SetupForCubeRecognition(const UNICHARSET& unicharset_in,
+                               tesseract::Tesseract* tesseract,
+                               const BLOCK* block);
+
+  // Sets up the members used in recognition for an empty recognition result:
+  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
+  void SetupFake(const UNICHARSET& uch);
+
+  // Set the word as having the script of the input unicharset.
+  void SetupWordScript(const UNICHARSET& unicharset_in);
+
+  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
+  void SetupBlamerBundle();
+
+  // Moves the results fields from word to this. This takes ownership of all
+  // the data, so word can be destructed.
+  // word1.ConsumeWordResults(word);
+  // delete word;
+  // is simpler and faster than:
+  // word1 = *word;
+  // delete word;
+  // as it doesn't need to copy and reallocate anything.
+  void ConsumeWordResults(WERD_RES* word);
+
+  // Replace the best choice and rebuild box word.
+  void ReplaceBestChoice(const WERD_CHOICE& choice,
+                         const GenericVector<int> &segmentation_state);

  // Builds the rebuild_word from the chopped_word and the best_state.
  void RebuildBestState();
@ -296,18 +612,30 @@ class WERD_RES : public ELIST_LINK {

  // Sets up the script positions in the output boxword using the best_choice
  // to get the unichars, and the unicharset to get the target positions.
-  void SetScriptPositions(const UNICHARSET& unicharset);
+  void SetScriptPositions();
+
+  // Returns the indices [start, end) containing the core of the word, stripped
+  // of any superscript digits on either side.
+  // (i.e., the non-footnote part of the word).
+  // Assumes that BoxWord is all set up for best_choice.
+  void WithoutFootnoteSpan(int *start, int *end) const;
+
+  // Given an alternate word choice and segmentation state, yield the indices
+  // [start, end) containing the core of the word, stripped of any superscript
+  // digits on either side.  (i.e. stripping off the footnote parts).
+  void WithoutFootnoteSpan(
+      const WERD_CHOICE &choice, const GenericVector<int> &state,
+      int *start, int *end) const;

  // Classifies the word with some already-calculated BLOB_CHOICEs.
  // The choices are an array of blob_count pointers to BLOB_CHOICE,
  // providing a single classifier result for each blob.
  // The BLOB_CHOICEs are consumed and the word takes ownership.
  // The number of blobs in the outword must match blob_count.
-  void FakeClassifyWord(const UNICHARSET& unicharset, int blob_count,
-                        BLOB_CHOICE** choices);
+  void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);

  // Copies the best_choice strings to the correct_text for adaption/training.
-  void BestChoiceToCorrectText(const UNICHARSET& unicharset);
+  void BestChoiceToCorrectText();

  // Merges 2 adjacent blobs in the result if the permanent callback
  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
@ -315,11 +643,28 @@ class WERD_RES : public ELIST_LINK {
  // result to the class returned from class_cb.
  // Returns true if anything was merged.
  bool ConditionalBlobMerge(
-      const UNICHARSET& unicharset,
      TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
      TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
      BLOB_CHOICE_LIST_CLIST *blob_choices);

+  // Callback helper for fix_quotes returns a double quote if both
+  // arguments are quote, otherwise INVALID_UNICHAR_ID.
+  UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
+  void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices);
+
+  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
+  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
+  UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
+  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
+  // (assuming both on the same textline, are in order and a chopped em dash.)
+  bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
+  void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices);
+
+  // Callback helper for merge_tess_fails returns a space if both
+  // arguments are space, otherwise INVALID_UNICHAR_ID.
+  UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
+  void merge_tess_fails();
+
  static WERD_RES* deep_copy(const WERD_RES* src) {
    return new WERD_RES(*src);
  }
@ -331,6 +676,10 @@ class WERD_RES : public ELIST_LINK {
    word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
    word->copy_on(word_res->word);
  }
+
+  // Returns true if the collection of count pieces, starting at start, are all
+  // natural connected components, ie there are no real chops involved.
+  bool PiecesAllNatural(int start, int count) const;
 };

 /*************************************************************************
@ -349,6 +698,18 @@ class PAGE_RES_IT {
    restart_page();  // ready to scan
  }

+  // Do two PAGE_RES_ITs point at the same word?
+  // This is much cheaper than cmp().
+  bool operator ==(const PAGE_RES_IT &other) const;
+
+  bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); }
+
+  // Given another PAGE_RES_IT to the same page,
+  //  this before other:     -1
+  //  this equal to other:    0
+  //  this later than other:  1
+  int cmp(const PAGE_RES_IT &other) const;
+
  WERD_RES *restart_page() {
    return start_page(false);  // Skip empty blocks.
  }
@ -357,6 +718,8 @@ class PAGE_RES_IT {
  }
  WERD_RES *start_page(bool empty_ok);

+  WERD_RES *restart_row();
+
  // ============ Methods that mutate the underlying structures ===========
  // Note that these methods will potentially invalidate other PAGE_RES_ITs
  // and are intended to be used only while a single PAGE_RES_IT is active.
@ -366,7 +729,7 @@ class PAGE_RES_IT {
  // Inserts the new_word and a corresponding WERD_RES before the current
  // position. The simple fields of the WERD_RES are copied from clone_res and
  // the resulting WERD_RES is returned for further setup with best_choice etc.
-  WERD_RES* InsertCloneWord(const WERD_RES& clone_res, WERD* new_word);
+  WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);

  // Deletes the current WERD_RES and its underlying WERD.
  void DeleteCurrentWord();
@ -379,8 +742,9 @@ class PAGE_RES_IT {
    return internal_forward(false, true);
  }

-  WERD_RES *forward_block();  // get first word in
-  // next non-empty block
+  WERD_RES *forward_paragraph();  // get first word in next non-empty paragraph
+  WERD_RES *forward_block();  // get first word in next non-empty block
+
  WERD_RES *prev_word() const {  // previous word
    return prev_word_res;
  }
--- a/ccstruct/pdblock.h
+++ b/ccstruct/pdblock.h
@ -59,7 +59,7 @@ class PDBLK
      if (hand_poly) delete hand_poly;
    }

-    POLY_BLOCK *poly_block() {
+    POLY_BLOCK *poly_block() const {
      return hand_poly;
    }
    ///set the poly block
--- a/ccstruct/points.cpp
+++ b/ccstruct/points.cpp
@ -19,6 +19,7 @@

 #include          "mfcpch.h"     //precompiled headers
 #include          <stdlib.h>
+#include          "helpers.h"
 #include          "ndminx.h"
 #include          "serialis.h"
 #include          "points.h"
@ -55,6 +56,24 @@ static int sign(int x) {
    return x > 0 ? 1 : 0;
 }

+// Writes to the given file. Returns false in case of error.
+bool ICOORD::Serialize(FILE* fp) const {
+  if (fwrite(&xcoord, sizeof(xcoord), 1, fp) != 1) return false;
+  if (fwrite(&ycoord, sizeof(ycoord), 1, fp) != 1) return false;
+  return true;
+}
+// Reads from the given file. Returns false in case of error.
+// If swap is true, assumes a big/little-endian swap is needed.
+bool ICOORD::DeSerialize(bool swap, FILE* fp) {
+  if (fread(&xcoord, sizeof(xcoord), 1, fp) != 1) return false;
+  if (fread(&ycoord, sizeof(ycoord), 1, fp) != 1) return false;
+  if (swap) {
+    ReverseN(&xcoord, sizeof(xcoord));
+    ReverseN(&ycoord, sizeof(ycoord));
+  }
+  return true;
+}
+
 // Setup for iterating over the pixels in a vector by the well-known
 // Bresenham rendering algorithm.
 // Starting with major/2 in the accumulator, on each step add major_step,
--- a/ccstruct/points.h
+++ b/ccstruct/points.h
@ -99,11 +99,11 @@ class ICOORD
    }

    ///test equality
-    BOOL8 operator== (const ICOORD & other) {
+    BOOL8 operator== (const ICOORD & other) const {
      return xcoord == other.xcoord && ycoord == other.ycoord;
    }
    ///test inequality
-    BOOL8 operator!= (const ICOORD & other) {
+    BOOL8 operator!= (const ICOORD & other) const {
      return xcoord != other.xcoord || ycoord != other.ycoord;
    }
    ///rotate 90 deg anti
@ -147,6 +147,12 @@ class ICOORD
    void setup_render(ICOORD* major_step, ICOORD* minor_step,
                      int* major, int* minor) const;

+    // Writes to the given file. Returns false in case of error.
+    bool Serialize(FILE* fp) const;
+    // Reads from the given file. Returns false in case of error.
+    // If swap is true, assumes a big/little-endian swap is needed.
+    bool DeSerialize(bool swap, FILE* fp);
+
  protected:
    inT16 xcoord;                //< x value
    inT16 ycoord;                //< y value
--- a/ccstruct/polyblk.cpp
+++ b/ccstruct/polyblk.cpp
@ -52,10 +52,9 @@ POLY_BLOCK::POLY_BLOCK(const TBOX& box, PolyBlockType t) {
  ICOORDELT_IT v = &vertices;
  v.move_to_first();
  v.add_to_end(new ICOORDELT(box.left(), box.top()));
-  v.add_to_end(new ICOORDELT(box.left(), box.top() + box.height()));
-  v.add_to_end(new ICOORDELT(box.left() + box.width(),
-                             box.top() + box.height()));
-  v.add_to_end(new ICOORDELT(box.left(), box.top() + box.height()));
+  v.add_to_end(new ICOORDELT(box.left(), box.bottom()));
+  v.add_to_end(new ICOORDELT(box.right(), box.bottom()));
+  v.add_to_end(new ICOORDELT(box.right(), box.top()));
  compute_bb();
  type = t;
 }
@ -204,6 +203,25 @@ void POLY_BLOCK::rotate(FCOORD rotation) {
  compute_bb();
 }

+/**
+ * @name POLY_BLOCK::reflect_in_y_axis
+ *
+ * Reflect the coords of the polygon in the y-axis. (Flip the sign of x.)
+ */
+
+void POLY_BLOCK::reflect_in_y_axis() {
+  ICOORDELT *pt;                 // current point
+  ICOORDELT_IT pts = &vertices;  // Iterator.
+
+  do {
+    pt = pts.data();
+    pt->set_x(-pt->x());
+    pts.forward();
+  }
+  while (!pts.at_first());
+  compute_bb();
+}
+

 /**
 * POLY_BLOCK::move
@ -384,6 +402,8 @@ ScrollView::Color POLY_BLOCK::ColorForPolyBlockType(PolyBlockType type) {
    ScrollView::BLUE,         // Text that lives inside a column.
    ScrollView::CYAN,         // Text that spans more than one column.
    ScrollView::MEDIUM_BLUE,  // Text that is in a cross-column pull-out region.
+    ScrollView::AQUAMARINE,   // Partition belonging to an equation region.
+    ScrollView::SKY_BLUE,   // Partition belonging to an inline equation region.
    ScrollView::MAGENTA,      // Partition belonging to a table region.
    ScrollView::GREEN,        // Text-line runs vertically.
    ScrollView::LIGHT_BLUE,   // Text that belongs to an image.
--- a/ccstruct/polyblk.h
+++ b/ccstruct/polyblk.h
@ -58,6 +58,8 @@ class DLLSYM POLY_BLOCK {
  // Rotate about the origin by the given rotation. (Analogous to
  // multiplying by a complex number.
  void rotate(FCOORD rotation);
+  // Reflect the coords of the polygon in the y-axis. (Flip the sign of x.)
+  void reflect_in_y_axis();
  // Move by adding shift to all coordinates.
  void move(ICOORD shift);

--- a/ccstruct/publictypes.cpp
+++ b/ccstruct/publictypes.cpp
@ -25,6 +25,8 @@ const char* kPolyBlockNames[] = {
  "Flowing Text",
  "Heading Text",
  "Pullout Text",
+  "Equation",
+  "Inline Equation",
  "Table",
  "Vertical Text",
  "Caption Text",
--- a/ccstruct/publictypes.h
+++ b/ccstruct/publictypes.h
@ -41,6 +41,8 @@ enum PolyBlockType {
  PT_FLOWING_TEXT,   // Text that lives inside a column.
  PT_HEADING_TEXT,   // Text that spans more than one column.
  PT_PULLOUT_TEXT,   // Text that is in a cross-column pull-out region.
+  PT_EQUATION,       // Partition belonging to an equation region.
+  PT_INLINE_EQUATION,  // Partition has inline equation.
  PT_TABLE,          // Partition belonging to a table region.
  PT_VERTICAL_TEXT,  // Text-line runs vertically.
  PT_CAPTION_TEXT,   // Text that belongs to an image.
@ -66,7 +68,8 @@ inline bool PTIsImageType(PolyBlockType type) {
 inline bool PTIsTextType(PolyBlockType type) {
  return type == PT_FLOWING_TEXT || type == PT_HEADING_TEXT ||
         type == PT_PULLOUT_TEXT || type == PT_TABLE ||
-         type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT;
+         type == PT_VERTICAL_TEXT || type == PT_CAPTION_TEXT ||
+         type == PT_INLINE_EQUATION;
 }

 // String name for each block type. Keep in sync with PolyBlockType.
@ -165,9 +168,6 @@ enum PageSegMode {
 // enum of the elements of the page hierarchy, used in ResultIterator
 // to provide functions that operate on each level without having to
 // have 5x as many functions.
-// NOTE: At present RIL_PARA and RIL_BLOCK are equivalent as there is
-// no paragraph internally yet.
-// TODO(rays) Add paragraph detection.
 enum PageIteratorLevel {
  RIL_BLOCK,     // Block of text/image/separator line.
  RIL_PARA,      // Paragraph within a block.
@ -176,6 +176,35 @@ enum PageIteratorLevel {
  RIL_SYMBOL     // Symbol/character within a word.
 };

+// Possible alignments of the text lines of a paragraph.
+//
+// JUSTIFICATION_UNKNOWN
+//   The alignment is not clearly one of the other options.  This could happen
+//   for example if there are only one or two lines of text or the text looks
+//   like source code or poetry.
+//
+// NOTA BENE: Fully justified paragraphs (text aligned to both left and right
+//    margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text
+//    is written with a left-to-right script and with JUSTIFICATION_RIGHT if
+//    their text is written in a right-to-left script.
+//
+// Interpretation for text read in vertical lines:
+//   "Left" is wherever the starting reading position is.
+//
+// JUSTIFICATION_LEFT
+//   Each line, except possibly the first, is flush to the same left tab stop.
+//
+// JUSTIFICATION_CENTER
+//   The text lines of the paragraph are centered about a line going
+//   down through the middle of the text lines.
+//
+// JUSTIFICATION_RIGHT
+//   Each line, except possibly the first, is flush to the same right tab stop.
+enum ParagraphJustification {
+  JUSTIFICATION_UNKNOWN,
+  JUSTIFICATION_LEFT,
+  JUSTIFICATION_CENTER,
+  JUSTIFICATION_RIGHT,
+};
+
 // When Tesseract/Cube is initialized we can choose to instantiate/load/run
 // only the Tesseract part, only the Cube part or both along with the combiner.
 // The preference of which engine to use is stored in tessedit_ocr_engine_mode.
--- a/ccstruct/ratngs.cpp
+++ b/ccstruct/ratngs.cpp
@ -28,6 +28,36 @@ ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE)

 const float WERD_CHOICE::kBadRating = 100000.0;

+// Human-readable names for the permuter types, one constant per name.
+static const char kPermuterTypeNoPerm[] = "None";
+static const char kPermuterTypePuncPerm[] = "Punctuation";
+static const char kPermuterTypeTopPerm[] = "Top Choice";
+static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
+static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
+static const char kPermuterTypeNgramPerm[] = "Ngram";
+static const char kPermuterTypeNumberPerm[] = "Number";
+static const char kPermuterTypeUserPatPerm[] = "User Pattern";
+static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
+static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
+static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
+static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
+static const char kPermuterTypeCompoundPerm[] = "Compound";
+
+// Lookup table of the names above; the trailing comments give each index.
+// NOTE(review): the table is indexed directly by a word's permuter value,
+// so the order presumably has to mirror the enum that declares NO_PERM --
+// confirm against that declaration whenever an entry is added or moved.
+static const char * const kPermuterTypeNames[] = {
+    kPermuterTypeNoPerm,        // 0
+    kPermuterTypePuncPerm,      // 1
+    kPermuterTypeTopPerm,       // 2
+    kPermuterTypeLowerPerm,     // 3
+    kPermuterTypeUpperPerm,     // 4
+    kPermuterTypeNgramPerm,     // 5
+    kPermuterTypeNumberPerm,    // 6
+    kPermuterTypeUserPatPerm,   // 7
+    kPermuterTypeSysDawgPerm,   // 8
+    kPermuterTypeDocDawgPerm,   // 9
+    kPermuterTypeUserDawgPerm,  // 10
+    kPermuterTypeFreqDawgPerm,  // 11
+    kPermuterTypeCompoundPerm   // 12
+};
+
 /**
 * BLOB_CHOICE::BLOB_CHOICE
 *
@ -38,7 +68,10 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
                         float src_cert,           // certainty
                         inT16 src_fontinfo_id,     // font
                         inT16 src_fontinfo_id2,    // 2nd choice font
-                         int src_script_id         // script
+                         int src_script_id,        // script
+                         inT16 min_xheight,        // min xheight allowed
+                         inT16 max_xheight,        // max xheight by this char
+                         bool adapted              // adapted match or not
                        ) {
  unichar_id_ = src_unichar_id;
  rating_ = src_rating;
@ -47,6 +80,9 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
  fontinfo_id2_ = src_fontinfo_id2;
  script_id_ = src_script_id;
  language_model_state_ = NULL;
+  min_xheight_ = min_xheight;
+  max_xheight_ = max_xheight;
+  adapted_ = adapted;
 }

 /**
@ -62,6 +98,9 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  fontinfo_id2_ = other.fontinfo_id2();
  script_id_ = other.script_id();
  language_model_state_ = NULL;
+  min_xheight_ = other.min_xheight_;
+  max_xheight_ = other.max_xheight_;
+  adapted_ = other.adapted_;
 }

 /**
@ -71,7 +110,8 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
 * The function assumes that src_string is not NULL.
 */
 WERD_CHOICE::WERD_CHOICE(const char *src_string,
-                         const UNICHARSET &unicharset) {
+                         const UNICHARSET &unicharset)
+    : unicharset_(&unicharset){
  STRING src_lengths;
  const char *ptr = src_string;
  const char *end = src_string + strlen(src_string);
@ -80,7 +120,7 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
       step = unicharset.step(ptr), src_lengths += step, ptr += step);
  if (step != 0 && ptr == end) {
    this->init(src_string, src_lengths.string(),
-               0.0, 0.0, NO_PERM, unicharset);
+               0.0, 0.0, NO_PERM);
  } else {  // there must have been an invalid unichar in the string
    this->init(8);
    this->make_bad();
@ -101,8 +141,7 @@ void WERD_CHOICE::init(const char *src_string,
                       const char *src_lengths,
                       float src_rating,
                       float src_certainty,
-                       uinT8 src_permuter,
-                       const UNICHARSET &unicharset) {
+                       uinT8 src_permuter) {
  int src_string_len = strlen(src_string);
  if (src_string_len == 0) {
    this->init(8);
@ -113,7 +152,7 @@ void WERD_CHOICE::init(const char *src_string,
    for (int i = 0; i < length_; ++i) {
      int unichar_length = src_lengths ? src_lengths[i] : 1;
      unichar_ids_[i] =
-          unicharset.unichar_to_id(src_string+offset, unichar_length);
+          unicharset_->unichar_to_id(src_string+offset, unichar_length);
      fragment_lengths_[i] = 1;
      offset += unichar_length;
    }
@ -132,6 +171,9 @@ WERD_CHOICE::~WERD_CHOICE() {
  delete_blob_choices();
 }

+/**
+ * WERD_CHOICE::permuter_name
+ *
+ * Returns the human-readable name of this word's permuter, taken from
+ * kPermuterTypeNames. permuter_ is a raw uinT8, so a value beyond the end
+ * of the table is reported as "Unknown" instead of indexing past the array.
+ */
+const char *WERD_CHOICE::permuter_name() const {
+  const unsigned int kNumPermuterNames =
+      sizeof(kPermuterTypeNames) / sizeof(kPermuterTypeNames[0]);
+  if (permuter_ >= kNumPermuterNames) return "Unknown";
+  return kPermuterTypeNames[permuter_];
+}

 /**
 * WERD_CHOICE::set_blob_choices
@ -177,20 +219,86 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
  length_ -= num;
 }

+/**
+ * reverse_and_mirror_unichar_ids
+ *
+ * Reverses the order of unichar_ids_ and replaces every id with the value
+ * unicharset_->get_mirror() returns for it.
+ * Note: unichar_string_ is not touched; only the unichar_ids_ array is
+ * modified.
+ */
+void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
+  int head = 0;
+  int tail = length_ - 1;
+  // Swap mirrored ids pairwise, working inwards from both ends.
+  for (; head < tail; ++head, --tail) {
+    UNICHAR_ID head_id = unichar_ids_[head];
+    unichar_ids_[head] = unicharset_->get_mirror(unichar_ids_[tail]);
+    unichar_ids_[tail] = unicharset_->get_mirror(head_id);
+  }
+  // With an odd length the indices meet on the middle id, which stays in
+  // place but still has to be mirrored.
+  if (head == tail) {
+    unichar_ids_[head] = unicharset_->get_mirror(unichar_ids_[head]);
+  }
+}
+
+/**
+ * punct_stripped
+ *
+ * Computes the half-open interval of unichar_id indices [start, end) that
+ * encloses the core portion of this word -- what remains after stripping
+ * punctuation from the left and from the right.
+ *
+ * Note: when every unichar is punctuation, the left scan stops at length()
+ * and the right scan yields 0, so *start exceeds *end for a non-empty
+ * word.
+ */
+void WERD_CHOICE::punct_stripped(int *start, int *end) const {
+  const int num_ids = length();
+  const UNICHARSET *charset = unicharset();
+  *start = 0;
+  *end = num_ids - 1;
+  // Skip over leading punctuation.
+  for (; *start < num_ids &&
+         charset->get_ispunctuation(unichar_id(*start)); ++(*start)) {
+  }
+  // Back up over trailing punctuation; *end is an inclusive index here.
+  for (; *end > -1 &&
+         charset->get_ispunctuation(unichar_id(*end)); --(*end)) {
+  }
+  ++(*end);  // Turn the inclusive index into an exclusive bound.
+}
+
+/**
+ * shallow_copy
+ *
+ * Returns a WERD_CHOICE built on the same unicharset that holds the unichar
+ * ids and fragment lengths at indices [start, end) of this word. Each entry
+ * is appended with 0.0f for both trailing float arguments.
+ * Both start and end must lie within [0, length_]; an end that is smaller
+ * than start is treated as equal to start, giving an empty result.
+ */
+WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
+  ASSERT_HOST(start >= 0 && start <= length_);
+  ASSERT_HOST(end >= 0 && end <= length_);
+  const int stop = (end < start) ? start : end;
+  WERD_CHOICE piece(unicharset_, stop - start);
+  for (int idx = start; idx < stop; ++idx) {
+    piece.append_unichar_id_space_allocated(
+        unichar_ids_[idx], fragment_lengths_[idx], 0.0f, 0.0f);
+  }
+  return piece;
+}
+
+/**
+ * has_rtl_unichar_id
+ *
+ * Returns true if at least one id in unichar_ids_ has a "strongly" RTL
+ * direction, i.e. U_RIGHT_TO_LEFT or U_RIGHT_TO_LEFT_ARABIC.
+ */
+bool WERD_CHOICE::has_rtl_unichar_id() const {
+  for (int pos = 0; pos < length_; ++pos) {
+    switch (unicharset_->get_direction(unichar_ids_[pos])) {
+      case UNICHARSET::U_RIGHT_TO_LEFT:
+      case UNICHARSET::U_RIGHT_TO_LEFT_ARABIC:
+        return true;
+      default:
+        break;  // Any other direction: keep scanning.
+    }
+  }
+  return false;
+}
+
 /**
 * string_and_lengths
 *
 * Populates the given word_str with unichars from unichar_ids and
 * and word_lengths_str with the corresponding unichar lengths.
- * Uses current_unicharset to make unichar id -> unichar conversions.
 */
-void WERD_CHOICE::string_and_lengths(const UNICHARSET &current_unicharset,
-                                     STRING *word_str,
+void WERD_CHOICE::string_and_lengths(STRING *word_str,
                                     STRING *word_lengths_str) const {
  *word_str = "";
  if (word_lengths_str != NULL) *word_lengths_str = "";
  for (int i = 0; i < length_; ++i) {
-    const char *ch = current_unicharset.id_to_unichar(unichar_ids_[i]);
+    const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
    *word_str += ch;
    if (word_lengths_str != NULL) {
      *word_lengths_str += strlen(ch);
@ -230,6 +338,7 @@ WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
  //   word_lengths = NULL;
  //   delete_blob_choices();
  // } else {
+  ASSERT_HOST(unicharset_ == second.unicharset_);
  while (reserved_ < length_ + second.length()) {
    this->double_the_size();
  }
@ -291,6 +400,7 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
    this->double_the_size();
  }

+  unicharset_ = source.unicharset_;
  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
  const char *other_fragment_lengths = source.fragment_lengths();
  for (int i = 0; i < source.length(); ++i) {
@ -376,6 +486,24 @@ const void WERD_CHOICE::print(const char *msg) const {
  fflush(stdout);
 }

+// Returns true if word1 and word2 use the same UNICHARSET object and the
+// ranges reported by punct_stripped() for each word have the same length
+// and hold ids that are pairwise equal after to_lower().
+bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
+                                       const WERD_CHOICE &word2) {
+  const UNICHARSET *charset = word1.unicharset();
+  if (charset != word2.unicharset()) return false;
+
+  int begin1, end1;
+  word1.punct_stripped(&begin1, &end1);
+  int begin2, end2;
+  word2.punct_stripped(&begin2, &end2);
+
+  const int core_len = end1 - begin1;
+  if (core_len != end2 - begin2) return false;
+
+  for (int k = 0; k < core_len; ++k) {
+    if (charset->to_lower(word1.unichar_id(begin1 + k)) !=
+        charset->to_lower(word2.unichar_id(begin2 + k))) {
+      return false;
+    }
+  }
+  return true;
+}
+
 /**
 * print_ratings_list
 *
@ -499,3 +627,27 @@ void print_char_choices_list(const char *msg,
      print_ratings_list("", char_choices.get(x), current_unicharset);
  }
 }
+
+/**
+ * print_word_alternates_list
+ *
+ * Prints via tprintf the unichar string of word followed by the quoted,
+ * comma-separated unichar strings of all alternates. Does nothing if either
+ * pointer is NULL. When needs_populate_unichars is true, populate_unichars()
+ * is first called on word and on every alternate.
+ */
+void print_word_alternates_list(
+    WERD_CHOICE *word,
+    GenericVector<WERD_CHOICE *> *alternates,
+    bool needs_populate_unichars) {
+  if (word == NULL || alternates == NULL) return;
+
+  const int num_alternates = alternates->size();
+  if (needs_populate_unichars) {
+    word->populate_unichars();
+    for (int a = 0; a < num_alternates; ++a) {
+      alternates->get(a)->populate_unichars();
+    }
+  }
+
+  // Join the alternates with `", "` so the format string below only has to
+  // supply the outermost pair of quotes.
+  STRING joined;
+  for (int a = 0; a < num_alternates; ++a) {
+    if (a != 0) joined += "\", \"";
+    joined += alternates->get(a)->unichar_string();
+  }
+  tprintf("Alternates for \"%s\": {\"%s\"}\n",
+          word->unichar_string().string(), joined.string());
+}
--- a/ccstruct/ratngs.h
+++ b/ccstruct/ratngs.h
@ -40,13 +40,19 @@ class BLOB_CHOICE: public ELIST_LINK
      certainty_ = -MAX_FLOAT32;
      script_id_ = -1;
      language_model_state_ = NULL;
+      min_xheight_ = 0;
+      max_xheight_ = 0;
+      adapted_ = false;
    }
    BLOB_CHOICE(UNICHAR_ID src_unichar_id,  // character id
                float src_rating,          // rating
                float src_cert,            // certainty
                inT16 src_fontinfo_id,      // font
                inT16 src_fontinfo_id2,     // 2nd choice font
-                int script_id);            // script
+                int script_id,             // script
+                inT16 min_xheight,         // min xheight in image pixel units
+                inT16 max_xheight,         // max xheight allowed by this char
+                bool adapted);             // adapted match or not
    BLOB_CHOICE(const BLOB_CHOICE &other);
    ~BLOB_CHOICE() {}

@ -71,12 +77,21 @@ class BLOB_CHOICE: public ELIST_LINK
    void *language_model_state() {
      return language_model_state_;
    }
-    inT16 xgap_before() {
+    inT16 xgap_before() const {
      return xgap_before_;
    }
-    inT16 xgap_after() {
+    inT16 xgap_after() const {
      return xgap_after_;
    }
+    inT16 min_xheight() const {
+      return min_xheight_;
+    }
+    inT16 max_xheight() const {
+      return max_xheight_;
+    }
+    bool adapted() const {
+      return adapted_;
+    }

    void set_unichar_id(UNICHAR_ID newunichar_id) {
      unichar_id_ = newunichar_id;
@ -105,6 +120,9 @@ class BLOB_CHOICE: public ELIST_LINK
    void set_xgap_after(inT16 gap) {
      xgap_after_ = gap;
    }
+    void set_adapted(bool adapted) {
+      adapted_ = adapted;
+    }
    static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
      BLOB_CHOICE* choice = new BLOB_CHOICE;
      *choice = *src;
@ -130,6 +148,10 @@ class BLOB_CHOICE: public ELIST_LINK
  void *language_model_state_;
  inT16 xgap_before_;
  inT16 xgap_after_;
+  // X-height range (in image pixels) that this classification supports.
+  inT16 min_xheight_;
+  inT16 max_xheight_;
+  bool adapted_;  // true if this is a match from adapted templates
 };

 // Make BLOB_CHOICE listable.
@ -156,24 +178,30 @@ class WERD_CHOICE {
 public:
  static const float kBadRating;

-  WERD_CHOICE() { this->init(8); }
-  WERD_CHOICE(int reserved) { this->init(reserved); }
+  WERD_CHOICE(const UNICHARSET *unicharset)
+    : unicharset_(unicharset) { this->init(8); }
+  WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
+    : unicharset_(unicharset) { this->init(reserved); }
  WERD_CHOICE(const char *src_string,
              const char *src_lengths,
              float src_rating,
              float src_certainty,
              uinT8 src_permuter,
-              const UNICHARSET &unicharset) {
+              const UNICHARSET &unicharset)
+    : unicharset_(&unicharset) {
    this->init(src_string, src_lengths, src_rating,
-               src_certainty, src_permuter, unicharset);
+               src_certainty, src_permuter);
  }
-  WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset);
-  WERD_CHOICE(const WERD_CHOICE &word) {
+  WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
+  WERD_CHOICE(const WERD_CHOICE &word) : unicharset_(word.unicharset_) {
    this->init(word.length());
    this->operator=(word);
  }
  ~WERD_CHOICE();

+  const UNICHARSET *unicharset() const {
+    return unicharset_;
+  }
  inline int length() const {
    return length_;
  }
@ -200,6 +228,7 @@ class WERD_CHOICE {
  inline uinT8 permuter() const {
    return permuter_;
  }
+  const char *permuter_name() const;
  inline bool fragment_mark() const {
    return fragment_mark_;
  }
@ -237,25 +266,37 @@ class WERD_CHOICE {

  /// Make more space in unichar_id_ and fragment_lengths_ arrays.
  inline void double_the_size() {
-    unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
-        reserved_, unichar_ids_);
-    fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
-        reserved_, fragment_lengths_);
-    reserved_ *= 2;
+    if (reserved_ > 0) {
+      unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
+          reserved_, unichar_ids_);
+      fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
+          reserved_, fragment_lengths_);
+      reserved_ *= 2;
+    } else {
+      unichar_ids_ = new UNICHAR_ID[1];
+      fragment_lengths_ = new char[1];
+      reserved_ = 1;
+    }
  }

  /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
  /// fragment_length_ arrays. Sets other values to default (blank) values.
  inline void init(int reserved) {
    reserved_ = reserved;
-    unichar_ids_ = new UNICHAR_ID[reserved];
-    fragment_lengths_ = new char[reserved];
+    if (reserved > 0) {
+      unichar_ids_ = new UNICHAR_ID[reserved];
+      fragment_lengths_ = new char[reserved];
+    } else {
+      unichar_ids_ = NULL;
+      fragment_lengths_ = NULL;
+    }
    length_ = 0;
    rating_ = 0.0;
    certainty_ = MAX_FLOAT32;
    permuter_ = NO_PERM;
    fragment_mark_ = false;
    blob_choices_ = NULL;
+    unichars_in_script_order_ = false;  // Tesseract is strict left-to-right.
    unichar_string_ = "";
    unichar_lengths_ = "";
  }
@ -267,7 +308,7 @@ class WERD_CHOICE {
  /// in src_string are assumed to all be of length 1.
  void init(const char *src_string, const char *src_lengths,
            float src_rating, float src_certainty,
-            uinT8 src_permuter, const UNICHARSET &current_unicharset);
+            uinT8 src_permuter);

  /// Set the fields in this choice to be default (bad) values.
  inline void make_bad() {
@ -308,13 +349,26 @@ class WERD_CHOICE {
  bool contains_unichar_id(UNICHAR_ID unichar_id) const;
  void remove_unichar_ids(int index, int num);
  inline void remove_last_unichar_id() { --length_; }
-  inline void remove_unichar_id(int index) { this->remove_unichar_ids(index, 1); }
-  void string_and_lengths(const UNICHARSET &current_unicharset,
-                          STRING *word_str, STRING *word_lengths_str) const;
-  const STRING debug_string(const UNICHARSET &current_unicharset) const {
+  inline void remove_unichar_id(int index) {
+    this->remove_unichar_ids(index, 1);
+  }
+  bool has_rtl_unichar_id() const;
+  void reverse_and_mirror_unichar_ids();
+
+  // Returns the half-open interval of unichar_id indices [start, end) which
+  // enclose the core portion of this word -- the part after stripping
+  // punctuation from the left and right.
+  void punct_stripped(int *start_core, int *end_core) const;
+
+  // Return a copy of this WERD_CHOICE with the choices [start, end).
+  // The result is useful only for checking against a dictionary.
+  WERD_CHOICE shallow_copy(int start, int end) const;
+
+  void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
+  const STRING debug_string() const {
    STRING word_str;
    for (int i = 0; i < length_; ++i) {
-      word_str += current_unicharset.debug_str(unichar_ids_[i]);
+      word_str += unicharset_->debug_str(unichar_ids_[i]);
      word_str += " ";
    }
    return word_str;
@ -322,16 +376,28 @@ class WERD_CHOICE {
  /// Since this function walks over the whole word to convert unichar ids
  /// to unichars, it is best to call it once, e.g. after all changes to
  /// unichar_ids_ in WERD_CHOICE are finished.
-  void populate_unichars(const UNICHARSET &current_unicharset) {
-    this->string_and_lengths(current_unicharset, &unichar_string_,
-                             &unichar_lengths_);
+  void populate_unichars() {
+    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
  }
+
  /// Undoes populate_unichars, so that unichar_string_ and unichar_lengths_
  /// are empty.
  void depopulate_unichars() {
    unichar_string_ = "";
    unichar_lengths_ = "";
  }
+
+  // Overrides the default assumption (strict left-to-right graphemes) to
+  // record whether some engine produced the graphemes of this word in
+  // "reading order". Returns the value that was just stored.
+  bool set_unichars_in_script_order(bool in_script_order) {
+    unichars_in_script_order_ = in_script_order;
+    return unichars_in_script_order_;
+  }
+
+  bool unichars_in_script_order() const {
+    return unichars_in_script_order_;
+  }
+
  /// This function should only be called if populate_unichars()
  /// was called and WERD_CHOICE did not change since then.
  const STRING &unichar_string() const {
@ -339,6 +405,7 @@ class WERD_CHOICE {
           unichar_string_.length() >= length_);  // sanity check
    return unichar_string_;
  }
+
  /// This function should only be called if populate_unichars()
  /// was called and WERD_CHOICE did not change since then.
  const STRING &unichar_lengths() const {
@ -355,6 +422,7 @@ class WERD_CHOICE {
  WERD_CHOICE& operator= (const WERD_CHOICE& source);

 private:
+  const UNICHARSET *unicharset_;
  UNICHAR_ID *unichar_ids_;  // unichar ids that represent the text of the word
  char *fragment_lengths_;   // number of fragments in each unichar
  int reserved_;             // size of the above arrays
@ -367,10 +435,17 @@ class WERD_CHOICE {
                             // contained a fragment
  BLOB_CHOICE_LIST_CLIST *blob_choices_;  // best choices for each blob

+  // Normally, the blob_choices_ represent the recognition results in order
+  // from left-to-right.  However, some engines (say Cube) may return
+  // recognition results in the order of the script's major reading direction
+  // (for Arabic, that is right-to-left).
+  bool unichars_in_script_order_;
+
  // The following variables are only populated by calling populate_unichars().
  // They are not synchronized with the values in unichar_ids otherwise.
  STRING unichar_string_;
  STRING unichar_lengths_;
+
  bool unichar_info_present;

 private:
@ -382,6 +457,12 @@ ELISTIZEH (WERD_CHOICE)
 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;

+// Utilities for comparing WERD_CHOICEs
+
+bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
+                                       const WERD_CHOICE &word2);
+
+// Utilities for debug printing.
 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
 void print_ratings_list(
    const char *msg,                      // intro message
@ -401,5 +482,9 @@ void print_char_choices_list(
    const UNICHARSET &current_unicharset,
    BOOL8 detailed
    );
+void print_word_alternates_list(
+    WERD_CHOICE *word,
+    GenericVector<WERD_CHOICE *> *alternates,
+    bool needs_populate_unichars);

 #endif
--- a/ccstruct/rect.cpp
+++ b/ccstruct/rect.cpp
@ -172,6 +172,19 @@ void TBOX::plot(                      //paint box
 }
 #endif

+// Writes bot_left and then top_right to the given file.
+// Returns false as soon as either write reports an error.
+bool TBOX::Serialize(FILE* fp) const {
+  return bot_left.Serialize(fp) && top_right.Serialize(fp);
+}
+// Reads bot_left and then top_right from the given file.
+// Returns false as soon as either read reports an error.
+// If swap is true, assumes a big/little-endian swap is needed.
+bool TBOX::DeSerialize(bool swap, FILE* fp) {
+  return bot_left.DeSerialize(swap, fp) && top_right.DeSerialize(swap, fp);
+}

 /**********************************************************************
 * operator+=
@ -200,15 +213,12 @@ const TBOX & op2) {


 /**********************************************************************
- * operator-=
+ * operator&=
 *
 * Reduce one box to intersection with the other  (In place intersection)
 **********************************************************************/

-DLLSYM TBOX &
-operator-= (                     //inplace intersection
-TBOX & op1,                       //operands
-const TBOX & op2) {
+TBOX& operator&=(TBOX& op1, const TBOX& op2) {
  if (op1.overlap (op2)) {
    if (op2.bot_left.x () > op1.bot_left.x ())
      op1.bot_left.set_x (op2.bot_left.x ());
@ -230,3 +240,15 @@ const TBOX & op2) {
  }
  return op1;
 }
+
+// Returns true if the left edges and the right edges of this box and the
+// given box each differ by no more than tolerance.
+bool TBOX::x_almost_equal(const TBOX &box, int tolerance) const {
+  const int left_diff = abs(left() - box.left());
+  const int right_diff = abs(right() - box.right());
+  return left_diff <= tolerance && right_diff <= tolerance;
+}
+
+// Returns true if all four edges (left, right, top, bottom) of this box
+// and the given box differ by no more than tolerance.
+bool TBOX::almost_equal(const TBOX &box, int tolerance) const {
+  if (abs(left() - box.left()) > tolerance) return false;
+  if (abs(right() - box.right()) > tolerance) return false;
+  if (abs(top() - box.top()) > tolerance) return false;
+  return abs(bottom() - box.bottom()) <= tolerance;
+}
--- a/ccstruct/rect.h
+++ b/ccstruct/rect.h
@ -23,8 +23,8 @@
 #include <math.h>
 #include "points.h"
 #include "ndminx.h"
-#include "tprintf.h"
 #include "scrollview.h"
+#include "tprintf.h"

 class DLLSYM TBOX  {  // bounding box
  public:
@ -46,7 +46,7 @@ class DLLSYM TBOX  {  // bounding box
      return ((left () >= right ()) || (top () <= bottom ()));
    }

-    bool operator==(const TBOX& other) {
+    bool operator==(const TBOX& other) const {
      return bot_left == other.bot_left && top_right == other.top_right;
    }

@ -115,6 +115,14 @@ class DLLSYM TBOX  {  // bounding box
        return 0;
    }

+    // Grows the box by xpad on both the left and right sides and by ypad
+    // on both the bottom and top: the bottom-left corner moves by
+    // (-xpad, -ypad) and the top-right corner by (+xpad, +ypad).
+    // NO checks for exceeding any bounds like 0 or an image size.
+    void pad(int xpad, int ypad) {
+      ICOORD margin(xpad, ypad);
+      top_right += margin;
+      bot_left -= margin;
+    }
+
    void move_bottom_edge(                  // move one edge
                          const inT16 y) {  // by +/- y
      bot_left += ICOORD (0, y);
@ -232,6 +240,12 @@ class DLLSYM TBOX  {  // bounding box
    // fraction of the current box's projected area covered by the other's
    double y_overlap_fraction(const TBOX& box) const;

+    // Returns true if the boxes are almost equal on x axis.
+    bool x_almost_equal(const TBOX &box, int tolerance) const;
+
+    // Returns true if the boxes are almost equal
+    bool almost_equal(const TBOX &box, int tolerance) const;
+
    TBOX intersection(  // shared area box
                     const TBOX &box) const;

@ -251,6 +265,15 @@ class DLLSYM TBOX  {  // bounding box
              left(), bottom(), right(), top());
    }

+    // Same as print(), except that the debug text describing the box
+    // (left, bottom, right, top) is appended to *str rather than being
+    // printed to stdout.
+    void append_debug(STRING *str) const {
+      char text[256];  // Scratch space for the formatted line.
+      sprintf(text, "Bounding box=(%d,%d)->(%d,%d)\n",
+              left(), bottom(), right(), top());
+      (*str) += text;
+    }
+
 #ifndef GRAPHICS_DISABLED
    void plot(                    // use current settings
              ScrollView* fd) const {  // where to paint
@ -263,10 +286,15 @@ class DLLSYM TBOX  {  // bounding box
              ScrollView::Color fill_colour,           // colour for inside
              ScrollView::Color border_colour) const;  // colour for border
 #endif
+    // Writes to the given file. Returns false in case of error.
+    bool Serialize(FILE* fp) const;
+    // Reads from the given file. Returns false in case of error.
+    // If swap is true, assumes a big/little-endian swap is needed.
+    bool DeSerialize(bool swap, FILE* fp);

-    friend DLLSYM TBOX & operator+= (TBOX &, const TBOX &);
+    friend TBOX& operator+=(TBOX&, const TBOX&);
    // in place union
-    friend DLLSYM TBOX & operator-= (TBOX &, const TBOX &);
+    friend TBOX& operator&=(TBOX&, const TBOX&);
    // in place intersection

  private:
--- a/ccstruct/seam.cpp
+++ b/ccstruct/seam.cpp
@ -72,6 +72,29 @@ bool point_in_seam(SEAM *seam, SPLIT *split) {
          point_in_split(seam->split3, split->point1, split->point2));
 }

+/**
+ * @name point_used_by_split
+ *
+ * Tell whether a particular EDGEPT * is one of the two end points
+ * (point1 or point2) of the given split. A NULL split uses no points.
+ * @returns true if the edgept is used by the split.
+ */
+bool point_used_by_split(SPLIT *split, EDGEPT *point) {
+  return split != NULL &&
+      (point == split->point1 || point == split->point2);
+}
+
+/**
+ * @name point_used_by_seam
+ *
+ * Tell whether a particular EDGEPT * is used by any of the three splits
+ * (split1, split2, split3) of the given seam. A NULL seam uses no points.
+ * @returns true if the edgept is used by the seam.
+ */
+bool point_used_by_seam(SEAM *seam, EDGEPT *point) {
+  if (seam == NULL) return false;
+  if (point_used_by_split(seam->split1, point)) return true;
+  if (point_used_by_split(seam->split2, point)) return true;
+  return point_used_by_split(seam->split3, point);
+}

 /**
 * @name add_seam
@ -152,28 +175,20 @@ void delete_seam(void *arg) {  //SEAM  *seam)
 SEAMS start_seam_list(TBLOB *blobs) {
  TBLOB *blob;
  SEAMS seam_list;
-  TPOINT topleft;
-  TPOINT botright;
  TPOINT location;
  /* Seam slot per char */
  seam_list = new_seam_list ();

  for (blob = blobs; blob->next != NULL; blob = blob->next) {
-
-    blob_bounding_box(blob, &topleft, &botright);
-    location.x = botright.x;
-    location.y = botright.y + topleft.y;
-    blob_bounding_box (blob->next, &topleft, &botright);
-    location.x += topleft.x;
-    location.y += botright.y + topleft.y;
-    location.x /= 2;
-    location.y /= 4;
-
-    seam_list = add_seam (seam_list,
-      new_seam (0.0, location, NULL, NULL, NULL));
+    TBOX bbox = blob->bounding_box();
+    TBOX nbox = blob->next->bounding_box();
+    location.x = (bbox.right() + nbox.left()) / 2;
+    location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4;
+    seam_list = add_seam(seam_list,
+        new_seam(0.0, location, NULL, NULL, NULL));
  }

-  return (seam_list);
+  return seam_list;
 }

 /**
--- a/ccstruct/seam.h
+++ b/ccstruct/seam.h
@ -94,6 +94,10 @@ bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2);

 bool point_in_seam(SEAM *seam, SPLIT *split);

+bool point_used_by_split(SPLIT *split, EDGEPT *point);
+
+bool point_used_by_seam(SEAM *seam, EDGEPT *point);
+
 SEAMS add_seam(SEAMS seam_list, SEAM *seam);

 void combine_seams(SEAM *dest_seam, SEAM *source_seam);
--- a/ccstruct/split.cpp
+++ b/ccstruct/split.cpp
@ -62,7 +62,7 @@ void delete_split(SPLIT *split) {
 *
 * Create an EDGEPT and hook it into an existing list of edge points.
 **********************************************************************/
-EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) { 
+EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
  EDGEPT *this_edgept;
  /* Create point */
  this_edgept = new EDGEPT;
@ -82,6 +82,20 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
  return (this_edgept);
 }

+/**********************************************************************
+ * remove_edgept
+ *
+ * Unlink an EDGEPT from its list, re-span prev->vec to next, delete it.
+ **********************************************************************/
+void remove_edgept(EDGEPT *point) {
+  EDGEPT *prev = point->prev;
+  EDGEPT *next = point->next;
+  prev->next = next;  // bypass point in the forward direction
+  next->prev = prev;  // ...and in the backward direction
+  prev->vec.x = next->pos.x - prev->pos.x;  // prev's vec now runs prev -> next
+  prev->vec.y = next->pos.y - prev->pos.y;
+  delete point;  // prev/next were dereferenced unchecked: both must be non-NULL
+}

 /**********************************************************************
 * new_split
--- a/ccstruct/split.h
+++ b/ccstruct/split.h
@ -72,6 +72,8 @@ void delete_split(SPLIT *split);

 EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev);

+void remove_edgept(EDGEPT *point);
+
 SPLIT *new_split(EDGEPT *point1, EDGEPT *point2);

 void print_split(SPLIT *split);
--- a/ccstruct/stepblob.cpp
+++ b/ccstruct/stepblob.cpp
@ -340,6 +340,15 @@ static void render_outline_list(C_OUTLINE_LIST *list,
  }
 }

+static void render_outline_list_outline(C_OUTLINE_LIST *list,
+                                        int left, int top, Pix* pix) {
+  C_OUTLINE_IT it(list);  // iterator over every outline in list
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    C_OUTLINE* outline = it.data();
+    outline->render_outline(left, top, pix);  // left/top/pix passed through as-is
+  }
+}
+
 // Returns a Pix rendering of the blob. pixDestroy after use.
 Pix* C_BLOB::render() {
  TBOX box = bounding_box();
@ -348,6 +357,15 @@ Pix* C_BLOB::render() {
  return pix;
 }

+// Returns a new Pix, sized to the blob's bounding box, with only the outlines
+// of the blob rendered into it (no fill). Caller must pixDestroy after use.
+Pix* C_BLOB::render_outline() {
+  TBOX box = bounding_box();
+  Pix* pix = pixCreate(box.width(), box.height(), 1);  // last arg is depth: 1 bpp
+  render_outline_list_outline(&outlines, box.left(), box.top(), pix);
+  return pix;
+}
+
 /**********************************************************************
 * C_BLOB::plot
 *
--- a/ccstruct/stepblob.h
+++ b/ccstruct/stepblob.h
@ -55,6 +55,9 @@ class C_BLOB:public ELIST_LINK

    // Returns a Pix rendering of the blob. pixDestroy after use.
    Pix* render();
+    // Returns a Pix rendering of the outline of the blob. (no fill).
+    // pixDestroy after use.
+    Pix* render_outline();

    void plot(                       //draw one
              ScrollView* window,         //window to draw in
--- a/ccstruct/vecfuncs.h
+++ b/ccstruct/vecfuncs.h
@ -28,7 +28,7 @@
 #include <math.h>
 #include "blobs.h"

-struct EDGEPT;
+class EDGEPT;

 /*----------------------------------------------------------------------
              M a c r o s
--- a/ccstruct/werd.cpp
+++ b/ccstruct/werd.cpp
@ -452,6 +452,8 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
    }
    if (!found) {
      not_found_it.add_after_then_move(werd_blob);
+    } else {
+      delete werd_blob;
    }
  }
  // Iterate over all not found blobs. Some of them may be due to
@ -462,7 +464,6 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
       not_found_it.forward()) {
    C_BLOB* not_found = not_found_it.data();
    TBOX not_found_box = not_found->bounding_box();
-    bool found = false;
    C_BLOB_IT existing_blobs_it(new_blobs_it);
    for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
         existing_blobs_it.forward()) {
@ -472,8 +473,8 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
           a_blob_box.major_overlap(not_found_box)) &&
           not_found_box.y_overlap(a_blob_box) > 0.8) {
        // Already taken care of.
-        found = true;
-        not_found_it.extract();
+        delete not_found_it.extract();
+        break;
      }
    }
  }
@ -487,6 +488,10 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
  WERD* new_werd = NULL;
  if (!new_werd_blobs.empty()) {
    new_werd = new WERD(&new_werd_blobs, this);
+  } else {
+    // Add the blobs back to this word so that it can be reused.
+    C_BLOB_IT this_list_it(cblob_list());
+    this_list_it.add_list_after(&not_found_blobs);
  }
  return new_werd;
 }
--- a/ccstruct/werd.h
+++ b/ccstruct/werd.h
@ -51,7 +51,8 @@ enum DISPLAY_FLAGS
  DF_TEXT,                       //< Correct ascii
  DF_POLYGONAL,                  //< Polyg approx
  DF_EDGE_STEP,                  //< Edge steps
-  DF_BN_POLYGONAL                //< BL normalisd polyapx
+  DF_BN_POLYGONAL,               //< BL normalisd polyapx
+  DF_BLAMER                      //< Blamer information
 };

 class ROW;                       //forward decl