Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@648 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 09:52:40 +08:00 · 2012-02-02 02:53:04 +00:00 · 2012-02-02 02:53:04 +00:00 · 6e3d810c1d
commit 6e3d810c1d
parent 04068c7055
42 changed files with 8993 additions and 2833 deletions
--- a/textord/Makefile.am
+++ b/textord/Makefile.am
@ -6,14 +6,15 @@ AM_CPPFLAGS = \
    -I$(top_srcdir)/cutil -I$(top_srcdir)/classify -I$(top_srcdir)/dict

 include_HEADERS = \
-    alignedblob.h bbgrid.h blkocc.h \
-    colfind.h colpartition.h colpartitionset.h \
+    alignedblob.h bbgrid.h blkocc.h blobgrid.h \
+    ccnontextdetect.h cjkpitch.h colfind.h colpartition.h colpartitionset.h \
    colpartitiongrid.h \
    devanagari_processing.h drawedg.h drawtord.h edgblob.h edgloop.h \
+    equationdetectbase.h \
    fpchop.h gap_map.h imagefind.h linefind.h makerow.h oldbasel.h \
    pithsync.h pitsync1.h scanedg.h sortflts.h strokewidth.h \
    tabfind.h tablefind.h tabvector.h \
-    tablerecog.h textord.h \
+    tablerecog.h textlineprojection.h textord.h \
    topitch.h tordmain.h tovars.h \
    underlin.h wordseg.h workingpartset.h

@ -34,13 +35,14 @@ libtesseract_textord_la_LIBADD = \
 endif

 libtesseract_textord_la_SOURCES = \
-    alignedblob.cpp bbgrid.cpp blkocc.cpp \
-    colfind.cpp colpartition.cpp colpartitionset.cpp \
+    alignedblob.cpp bbgrid.cpp blkocc.cpp blobgrid.cpp \
+    ccnontextdetect.cpp cjkpitch.cpp colfind.cpp colpartition.cpp colpartitionset.cpp \
    colpartitiongrid.cpp devanagari_processing.cpp \
    drawedg.cpp drawtord.cpp edgblob.cpp edgloop.cpp \
+    equationdetectbase.cpp \
    fpchop.cpp gap_map.cpp imagefind.cpp linefind.cpp makerow.cpp oldbasel.cpp \
    pithsync.cpp pitsync1.cpp scanedg.cpp sortflts.cpp strokewidth.cpp \
    tabfind.cpp tablefind.cpp tabvector.cpp \
-    tablerecog.cpp textord.cpp \
+    tablerecog.cpp textlineprojection.cpp textord.cpp \
    topitch.cpp tordmain.cpp tospace.cpp tovars.cpp \
    underlin.cpp wordseg.cpp workingpartset.cpp
--- a/textord/alignedblob.cpp
+++ b/textord/alignedblob.cpp
@ -39,11 +39,11 @@ namespace tesseract {
 // Fraction of resolution used as alignment tolerance for aligned tabs.
 const double kAlignedFraction = 0.03125;
 // Fraction of resolution used as alignment tolerance for ragged tabs.
-const double kRaggedFraction = 0.5;
+const double kRaggedFraction = 2.5;
 // Fraction of height used as a minimum gutter gap for aligned blobs.
 const double kAlignedGapFraction = 0.75;
 // Fraction of height used as a minimum gutter gap for ragged tabs.
-const double kRaggedGapFraction = 3.0;
+const double kRaggedGapFraction = 1.0;
 // Constant number of pixels used as alignment tolerance for line finding.
 const int kVLineAlignment = 3;
 // Constant number of pixels used as gutter gap tolerance for line finding.
@ -163,7 +163,7 @@ void AlignedBlobParams::set_vertical(int vertical_x, int vertical_y) {

 AlignedBlob::AlignedBlob(int gridsize,
                         const ICOORD& bleft, const ICOORD& tright)
-  : BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>(gridsize, bleft, tright) {
+  : BlobGrid(gridsize, bleft, tright) {
 }

 AlignedBlob::~AlignedBlob() {
@ -196,24 +196,24 @@ ScrollView* AlignedBlob::DisplayTabs(const char* window_name,
    int bottom_y = box.bottom();
    TabType tabtype = bbox->left_tab_type();
    if (tabtype != TT_NONE) {
-      if (tabtype == TT_UNCONFIRMED)
+      if (tabtype == TT_MAYBE_ALIGNED)
        tab_win->Pen(ScrollView::BLUE);
+      else if (tabtype == TT_MAYBE_RAGGED)
+        tab_win->Pen(ScrollView::YELLOW);
      else if (tabtype == TT_CONFIRMED)
        tab_win->Pen(ScrollView::GREEN);
-      else if (tabtype == TT_FAKE)
-        tab_win->Pen(ScrollView::YELLOW);
      else
        tab_win->Pen(ScrollView::GREY);
      tab_win->Line(left_x, top_y, left_x, bottom_y);
    }
    tabtype = bbox->right_tab_type();
    if (tabtype != TT_NONE) {
-      if (tabtype == TT_UNCONFIRMED)
+      if (tabtype == TT_MAYBE_ALIGNED)
        tab_win->Pen(ScrollView::MAGENTA);
+      else if (tabtype == TT_MAYBE_RAGGED)
+        tab_win->Pen(ScrollView::ORANGE);
      else if (tabtype == TT_CONFIRMED)
        tab_win->Pen(ScrollView::RED);
-      else if (tabtype == TT_FAKE)
-        tab_win->Pen(ScrollView::ORANGE);
      else
        tab_win->Pen(ScrollView::GREY);
      tab_win->Line(right_x, top_y, right_x, bottom_y);
@ -224,6 +224,17 @@ ScrollView* AlignedBlob::DisplayTabs(const char* window_name,
  return tab_win;
 }

+// Helper returns true if the line_crossings() values of all the blobs in the
+// list sum to at least 2. The whole list is always visited (no early exit).
+static bool AtLeast2LineCrossings(BLOBNBOX_CLIST* blobs) {
+  BLOBNBOX_C_IT it(blobs);
+  int total_crossings = 0;
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    total_crossings += it.data()->line_crossings();
+  }
+  return total_crossings >= 2;
+}
+
 // Finds a vector corresponding to a set of vertically aligned blob edges
 // running through the given box. The type of vector returned and the
 // search parameters are determined by the AlignedBlobParams.
@ -237,11 +248,13 @@ TabVector* AlignedBlob::FindVerticalAlignment(AlignedBlobParams align_params,
  int ext_start_y, ext_end_y;
  BLOBNBOX_CLIST good_points;
  // Search up and then down from the starting bbox.
+  TBOX box = bbox->bounding_box();
+  bool debug = WithinTestRegion(2, box.left(), box.bottom());
  int pt_count = AlignTabs(align_params, false, bbox, &good_points, &ext_end_y);
  pt_count += AlignTabs(align_params, true, bbox, &good_points, &ext_start_y);
  BLOBNBOX_C_IT it(&good_points);
  it.move_to_last();
-  TBOX box = it.data()->bounding_box();
+  box = it.data()->bounding_box();
  int end_y = box.top();
  int end_x = align_params.right_tab ? box.right() : box.left();
  it.move_to_first();
@ -251,9 +264,14 @@ TabVector* AlignedBlob::FindVerticalAlignment(AlignedBlobParams align_params,
  // Acceptable tab vectors must have a mininum number of points,
  // have a minimum acceptable length, and have a minimum gradient.
  // The gradient corresponds to the skew angle.
-  if (pt_count >= align_params.min_points &&
+  // Ragged tabs don't need to satisfy the gradient condition, as they
+  // will always end up parallel to the vertical direction.
+  bool at_least_2_crossings = AtLeast2LineCrossings(&good_points);
+  if ((pt_count >= align_params.min_points &&
      end_y - start_y >= align_params.min_length &&
-      end_y - start_y >= abs(end_x - start_x) * kMinTabGradient) {
+      (align_params.ragged ||
+          end_y - start_y >= abs(end_x - start_x) * kMinTabGradient)) ||
+      at_least_2_crossings) {
    int confirmed_points = 0;
    // Count existing confirmed points to see if vector is acceptable.
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
@ -270,7 +288,7 @@ TabVector* AlignedBlob::FindVerticalAlignment(AlignedBlobParams align_params,
    if (!align_params.ragged ||
        confirmed_points + confirmed_points < pt_count) {
      const TBOX& box = bbox->bounding_box();
-      if (WithinTestRegion(2, box.left(), box.bottom())) {
+      if (debug) {
        tprintf("Confirming tab vector of %d pts starting at %d,%d\n",
                pt_count, box.left(), box.bottom());
      }
@ -282,6 +300,9 @@ TabVector* AlignedBlob::FindVerticalAlignment(AlignedBlobParams align_params,
        } else {
          bbox->set_left_tab_type(align_params.confirmed_type);
        }
+        if (debug) {
+          bbox->bounding_box().print();
+        }
      }
      // Now make the vector and return it.
      TabVector* result = TabVector::FitVector(align_params.alignment,
@ -289,12 +310,21 @@ TabVector* AlignedBlob::FindVerticalAlignment(AlignedBlobParams align_params,
                                               ext_start_y, ext_end_y,
                                               &good_points,
                                               vertical_x, vertical_y);
-      if (WithinTestRegion(2, box.left(), box.bottom())) {
+      result->set_intersects_other_lines(at_least_2_crossings);
+      if (debug) {
        tprintf("Box was %d, %d\n", box.left(), box.bottom());
        result->Print("After fitting");
      }
      return result;
+    } else if (debug) {
+      tprintf("Ragged tab used too many used points: %d out of %d\n",
+              confirmed_points, pt_count);
    }
+  } else if (debug) {
+    tprintf("Tab vector failed basic tests: pt count %d vs min %d, "
+            "length %d vs min %d, min grad %g\n",
+            pt_count, align_params.min_points, end_y - start_y,
+            align_params.min_length, abs(end_x - start_x) * kMinTabGradient);
  }
  return NULL;
 }
@ -310,13 +340,18 @@ int AlignedBlob::AlignTabs(const AlignedBlobParams& params,
  BLOBNBOX_C_IT it(good_points);

  TBOX box = bbox->bounding_box();
+  bool debug = WithinTestRegion(2, box.left(), box.bottom());
+  if (debug) {
+    tprintf("Starting alignment run at blob:");
+    box.print();
+  }
  int x_start = params.right_tab ? box.right() : box.left();
  while (bbox != NULL) {
    // Add the blob to the list if the appropriate side is a tab candidate,
    // or if we are working on a ragged tab.
-    if (((params.right_tab && bbox->right_tab_type() != TT_NONE) ||
-         (!params.right_tab && bbox->left_tab_type() != TT_NONE) ||
-         params.ragged) &&
+    TabType type = params.right_tab ? bbox->right_tab_type()
+                                    : bbox->left_tab_type();
+    if (((type != TT_NONE && type != TT_MAYBE_RAGGED) || params.ragged) &&
        (it.empty() || it.data() != bbox)) {
      if (top_to_bottom)
        it.add_before_then_move(bbox);
@ -335,6 +370,10 @@ int AlignedBlob::AlignTabs(const AlignedBlobParams& params,
        x_start = params.right_tab ? box.right() : box.left();
    }
  }
+  if (debug) {
+    tprintf("Alignment run ended with %d pts at blob:", ptcount);
+    box.print();
+  }
  return ptcount;
 }

@ -417,15 +456,12 @@ BLOBNBOX* AlignedBlob::FindAlignedBlob(const AlignedBlobParams& p,
    // waiting for a sequence of blobs in a line to end.
    // NextVerticalSearch alone does not guarantee this, as there may be
    // more than one blob in a grid cell. See comment in AlignTabs.
-    if ((n_y < start_y) != top_to_bottom || n_y == start_y)
+    if ((n_y < start_y) != top_to_bottom || nbox.y_overlap(box))
      continue;  // Only look in the required direction.
-    if (result != NULL &&
-        ((top_to_bottom && n_y < result->bounding_box().bottom()) ||
-         (!top_to_bottom && n_y > result->bounding_box().top())))
+    if (result != NULL && result->bounding_box().y_gap(nbox) > gridsize())
      return result;  // This result is clear.
-    if (backup_result != NULL && p.ragged &&
-        ((top_to_bottom && n_y < backup_result->bounding_box().bottom()) ||
-         (!top_to_bottom && n_y > backup_result->bounding_box().top())))
+    if (backup_result != NULL && p.ragged && result == NULL &&
+        backup_result->bounding_box().y_gap(nbox) > gridsize())
      return backup_result;  // This result is clear.

    // If the neighbouring blob is the wrong side of a separator line, then it
@ -446,7 +482,7 @@ BLOBNBOX* AlignedBlob::FindAlignedBlob(const AlignedBlobParams& p,
        n_right > x_at_n_y + p.r_align_tolerance &&
        (p.ragged || n_left < x_at_n_y + p.gutter_fraction * nbox.height())) {
      // In the gutter so end of line.
-      if (bbox->right_tab_type() >= TT_UNCONFIRMED)
+      if (bbox->right_tab_type() >= TT_MAYBE_ALIGNED)
        bbox->set_right_tab_type(TT_DELETED);
      *end_y = top_to_bottom ? nbox.top() : nbox.bottom();
      if (WithinTestRegion(2, x_start, start_y))
@ -458,7 +494,7 @@ BLOBNBOX* AlignedBlob::FindAlignedBlob(const AlignedBlobParams& p,
        n_right > x_at_n_y - p.min_gutter &&
        (p.ragged || n_right > x_at_n_y - p.gutter_fraction * nbox.height())) {
      // In the gutter so end of line.
-      if (bbox->left_tab_type() >= TT_UNCONFIRMED)
+      if (bbox->left_tab_type() >= TT_MAYBE_ALIGNED)
        bbox->set_left_tab_type(TT_DELETED);
      *end_y = top_to_bottom ? nbox.top() : nbox.bottom();
      if (WithinTestRegion(2, x_start, start_y))
@ -476,15 +512,23 @@ BLOBNBOX* AlignedBlob::FindAlignedBlob(const AlignedBlobParams& p,
        tprintf("aligned, seeking%d, l=%d, r=%d\n",
                p.right_tab, neighbour->left_tab_type(),
                neighbour->right_tab_type());
-      if ((p.right_tab && neighbour->right_tab_type() != TT_NONE) ||
-          (!p.right_tab && neighbour->left_tab_type() != TT_NONE)) {
+      TabType n_type = p.right_tab ? neighbour->right_tab_type()
+                                   : neighbour->left_tab_type();
+      if (n_type != TT_NONE && (p.ragged || n_type != TT_MAYBE_RAGGED)) {
        if (result == NULL) {
          result = neighbour;
        } else {
-          // Keep the closest neighbour.
-          int old_y = (result->bounding_box().top() +
-                       result->bounding_box().bottom()) / 2;
-          if (abs(n_y - start_y) < abs(old_y - start_y))
+          // Keep the closest neighbour by Euclidean distance.
+          // This prevents it from picking a tab blob in another column.
+          const TBOX& old_box = result->bounding_box();
+          int x_diff = p.right_tab ? old_box.right() : old_box.left();
+          x_diff -= x_at_n_y;
+          int y_diff = (old_box.top() + old_box.bottom()) / 2 - start_y;
+          int old_dist = x_diff * x_diff + y_diff * y_diff;
+          x_diff = n_x - x_at_n_y;
+          y_diff = n_y - start_y;
+          int new_dist = x_diff * x_diff + y_diff * y_diff;
+          if (new_dist < old_dist)
            result = neighbour;
        }
      } else if (backup_result == NULL) {
--- a/textord/alignedblob.h
+++ b/textord/alignedblob.h
@ -80,7 +80,7 @@ struct AlignedBlobParams {
 // The AlignedBlob class contains code to find vertically aligned blobs.
 // This is factored out into a separate class, so it can be used by both
 // vertical line finding (LineFind) and tabstop finding (TabFind).
-class AlignedBlob : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
+class AlignedBlob : public BlobGrid {
 public:
  AlignedBlob(int gridsize, const ICOORD& bleft, const ICOORD& tright);
  virtual ~AlignedBlob();
--- a/textord/bbgrid.cpp
+++ b/textord/bbgrid.cpp
@ -153,6 +153,61 @@ IntGrid* IntGrid::NeighbourhoodSum() const {
  return sumgrid;
 }

+// Returns true if more than half the area of the rect is covered by grid
+// cells whose value is strictly greater than the threshold.
+bool IntGrid::RectMostlyOverThreshold(const TBOX& rect, int threshold) const {
+  int min_x, min_y, max_x, max_y;
+  GridCoords(rect.left(), rect.bottom(), &min_x, &min_y);
+  GridCoords(rect.right(), rect.top(), &max_x, &max_y);
+  int total_area = 0;
+  for (int y = min_y; y <= max_y; ++y) {
+    for (int x = min_x; x <= max_x; ++x) {
+      int value = GridCellValue(x, y);
+      if (value > threshold) {
+        TBOX cell_box(x * gridsize_, y * gridsize_,
+                      (x + 1) * gridsize_, (y + 1) * gridsize_);
+        cell_box &= rect;  // This is in-place box intersection.
+        total_area += cell_box.area();
+      }
+    }
+  }
+  return total_area * 2 > rect.area();
+}
+
+// Returns true if any grid cell overlapped by the given rectangle is zero.
+bool IntGrid::AnyZeroInRect(const TBOX& rect) const {
+  int min_x, min_y, max_x, max_y;
+  GridCoords(rect.left(), rect.bottom(), &min_x, &min_y);
+  GridCoords(rect.right(), rect.top(), &max_x, &max_y);
+  for (int y = min_y; y <= max_y; ++y) {
+    for (int x = min_x; x <= max_x; ++x) {
+      if (GridCellValue(x, y) == 0)
+        return true;
+    }
+  }
+  return false;
+}
+
+// Returns a full-resolution 1 bpp pix in which each cell strictly over the
+// given threshold is filled as a black square. pixDestroy after use.
+// Edge cells, i.e. cells with any 4-neighbour that is not > 0, are not marked.
+Pix* IntGrid::ThresholdToPix(int threshold) const {
+  Pix* pix = pixCreate(tright().x() - bleft().x(),
+                       tright().y() - bleft().y(), 1);
+  int cellsize = gridsize();
+  for (int y = 0; y < gridheight(); ++y) {
+    for (int x = 0; x < gridwidth(); ++x) {
+      if (GridCellValue(x, y) > threshold &&
+          GridCellValue(x - 1, y) > 0 && GridCellValue(x + 1, y) > 0 &&
+              GridCellValue(x, y - 1) > 0 && GridCellValue(x, y + 1) > 0) {
+        pixRasterop(pix, x * cellsize, tright().y() - ((y + 1) * cellsize),
+                    cellsize, cellsize, PIX_SET, NULL, 0, 0);
+      }
+    }
+  }
+  return pix;
+}
+
 // Make a Pix of the correct scaled size for the TraceOutline functions.
 Pix* GridReducedPix(const TBOX& box, int gridsize,
                    ICOORD bleft, int* left, int* bottom) {
@ -232,4 +287,3 @@ Pix* TraceBlockOnReducedPix(BLOCK* block, int gridsize,
 }

 }  // namespace tesseract.
-
--- a/textord/bbgrid.h
+++ b/textord/bbgrid.h
@ -123,8 +123,7 @@ class IntGrid : public GridBase {
  IntGrid* NeighbourhoodSum() const;

  int GridCellValue(int grid_x, int grid_y) const {
-    ASSERT_HOST(grid_x >= 0 && grid_x < gridwidth());
-    ASSERT_HOST(grid_y >= 0 && grid_y < gridheight());
+    ClipGridCoords(&grid_x, &grid_y);
    return grid_[grid_y * gridwidth_ + grid_x];
  }
  void SetGridCell(int grid_x, int grid_y, int value) {
@ -132,6 +131,16 @@ class IntGrid : public GridBase {
    ASSERT_HOST(grid_y >= 0 && grid_y < gridheight());
    grid_[grid_y * gridwidth_ + grid_x] = value;
  }
+  // Returns true if more than half the area of the rect is covered by grid
+  // cells that are over the threshold.
+  bool RectMostlyOverThreshold(const TBOX& rect, int threshold) const;
+
+  // Returns true if any cell value in the given rectangle is zero.
+  bool AnyZeroInRect(const TBOX& rect) const;
+
+  // Returns a full-resolution binary pix in which each cell over the given
+  // threshold is filled as a black square. pixDestroy after use.
+  Pix* ThresholdToPix(int threshold) const;

 private:
  int* grid_;  // 2-d array of ints.
@ -373,6 +382,24 @@ int SortByBoxLeft(const void* void1, const void* void2) {
  return p1->bounding_box().top() - p2->bounding_box().top();
 }

+// Sort function: right() then left() descending; ties by bottom(), top().
+template<class BBC>
+int SortRightToLeft(const void* void1, const void* void2) {
+  // The void*s are actually doubly indirected, so get rid of one level.
+  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
+  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
+  int result = p2->bounding_box().right() - p1->bounding_box().right();
+  if (result != 0)
+    return result;
+  result = p2->bounding_box().left() - p1->bounding_box().left();
+  if (result != 0)
+    return result;
+  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
+  if (result != 0)
+    return result;
+  return p1->bounding_box().top() - p2->bounding_box().top();
+}
+
 // Sort function to sort a BBC by bounding_box().bottom().
 template<class BBC>
 int SortByBoxBottom(const void* void1, const void* void2) {
@ -859,6 +886,9 @@ void GridSearch<BBC, BBC_CLIST, BBC_C_IT>::RemoveBBox() {

 template<class BBC, class BBC_CLIST, class BBC_C_IT>
 void GridSearch<BBC, BBC_CLIST, BBC_C_IT>::RepositionIterator() {
+  // Something was deleted, so we have little choice but to clear the
+  // returns list.
+  returns_.shallow_clear();
  // Reset the iterator back to one past the previous return.
  // If the previous_return_ is no longer in the list, then
  // next_return_ serves as a backup.
--- a/textord/blobgrid.cpp
+++ b/textord/blobgrid.cpp
@ -0,0 +1,44 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blobgrid.cpp
+// Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods.
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+// Created:     Sat Jun 11 10:30:01 PST 2011
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "blobgrid.h"
+
+namespace tesseract {
+
+BlobGrid::BlobGrid(int gridsize, const ICOORD& bleft, const ICOORD& tright)
+  : BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>(gridsize, bleft, tright) {
+}
+
+BlobGrid::~BlobGrid() {
+}
+
+// Inserts the blobs from the given list that are not joined_to_prev(), with
+// x and y spreading, without removing them from the source list, so
+// ownership remains with the source list.
+void BlobGrid::InsertBlobList(BLOBNBOX_LIST* blobs) {
+  BLOBNBOX_IT blob_it(blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    if (!blob->joined_to_prev())
+      InsertBBox(true, true, blob);
+  }
+}
+
+
+}  // namespace tesseract.
--- a/textord/blobgrid.h
+++ b/textord/blobgrid.h
@ -0,0 +1,46 @@
+///////////////////////////////////////////////////////////////////////
+// File:        blobgrid.h
+// Description: BBGrid of BLOBNBOX with useful BLOBNBOX-specific methods.
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+// Created:     Sat Jun 11 10:26:01 PST 2011
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+
+#ifndef TESSERACT_TEXTORD_BLOBGRID_H_
+#define TESSERACT_TEXTORD_BLOBGRID_H_
+
+#include "bbgrid.h"
+#include "blobbox.h"
+
+CLISTIZEH(BLOBNBOX)
+
+namespace tesseract {
+
+typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> BlobGridSearch;
+
+class BlobGrid : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
+ public:
+  BlobGrid(int gridsize, const ICOORD& bleft, const ICOORD& tright);
+  virtual ~BlobGrid();
+
+  // Inserts the blobs from the given list that are not joined_to_prev(), with
+  // x and y spreading, without removing them from the source list, so
+  // ownership remains with the source list.
+  void InsertBlobList(BLOBNBOX_LIST* blobs);
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_TEXTORD_BLOBGRID_H_
--- a/textord/ccnontextdetect.cpp
+++ b/textord/ccnontextdetect.cpp
@ -0,0 +1,310 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ccnontextdetect.cpp
+// Description: Connected-Component-based photo (non-text) detection.
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+// Created:     Sat Jun 11 10:12:01 PST 2011
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "ccnontextdetect.h"
+#include "imagefind.h"
+#include "strokewidth.h"
+
+namespace tesseract {
+
+// Max number of neighbour small objects per squared gridsize before a grid
+// cell becomes image.
+const double kMaxSmallNeighboursPerPix = 1.0 / 32;
+// Max number of small blobs a large blob may overlap before it is rejected
+// and determined to be image.
+const int kMaxLargeOverlapsWithSmall = 3;
+// Max number of small blobs a medium blob may overlap before it is rejected
+// and determined to be image. Larger than for large blobs as medium blobs
+// may be complex Chinese characters. Very large Chinese characters are going
+// to overlap more medium blobs than small.
+const int kMaxMediumOverlapsWithSmall = 12;
+// Max number of normal blobs a large blob may overlap before it is rejected
+// and determined to be image. This is set higher to allow for drop caps, which
+// may overlap a lot of good text blobs.
+const int kMaxLargeOverlapsWithMedium = 12;
+// Multiplier of original noise_count used to test for the case of spreading
+// noise beyond where it should really be.
+const int kOriginalNoiseMultiple = 8;
+// Pixel padding for noise blobs when rendering on the image
+// mask to encourage them to join together. Make it too big and images
+// will fatten out too much and have to be clipped to text.
+const int kNoisePadding = 4;
+// Fraction of max_noise_count_ to be added to the noise count if there is
+// photo mask in the background.
+const double kPhotoOffsetFraction = 0.375;
+// Min ratio of perimeter^2/16area for a "good" blob in estimating noise
+// density. Good blobs are supposed to be highly likely real text.
+// We consider a square to have unit ratio, where A=(p/4)^2, hence the factor
+// of 16. Digital circles are weird and have a minimum ratio of 4/pi (area/p^2
+// of pi/64), not the pi/4 (area/p^2 of 1/(4pi)) that you would expect.
+const double kMinGoodTextPARatio = 1.5;
+
+CCNonTextDetect::CCNonTextDetect(int gridsize,
+                             const ICOORD& bleft, const ICOORD& tright)
+  : BlobGrid(gridsize, bleft, tright),
+    max_noise_count_(static_cast<int>(kMaxSmallNeighboursPerPix *
+                                      gridsize * gridsize)),
+    noise_density_(NULL) {
+  // TODO(rays) break max_noise_count_ out into an area-proportional
+  // value, as now plus an additive constant for the number of text blobs
+  // in the 3x3 neighbourhood - maybe 9.
+}
+
+CCNonTextDetect::~CCNonTextDetect() {
+  delete noise_density_;
+}
+
+// Creates and returns a Pix with the same resolution as the original
+// in which 1 (black) pixels represent likely non text (photo, line drawing)
+// areas of the page, deleting from the blob_block the blobs that were
+// determined to be non-text.
+// The photo_map is used to bias the decision towards non-text, rather than
+// supplying a definite decision.
+// The blob_block is the usual result of connected component analysis,
+// holding the detected blobs.
+// The returned Pix should be released with pixDestroy after use.
+Pix* CCNonTextDetect::ComputeNonTextMask(bool debug, Pix* photo_map,
+                                         TO_BLOCK* blob_block) {
+  // Insert the smallest blobs into the grid.
+  InsertBlobList(&blob_block->small_blobs);
+  InsertBlobList(&blob_block->noise_blobs);
+  // Add the medium blobs that don't have a good strokewidth neighbour.
+  // Those that do go into good_grid as an antidote to spreading beyond the
+  // real reaches of a noise region.
+  BlobGrid good_grid(gridsize(), bleft(), tright());
+  BLOBNBOX_IT blob_it(&blob_block->blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0;
+    perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area();
+    if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio)
+      InsertBBox(true, true, blob);
+    else
+      good_grid.InsertBBox(true, true, blob);
+  }
+  noise_density_ = ComputeNoiseDensity(debug, photo_map, &good_grid);
+  good_grid.Clear();  // Not needed any more.
+  Pix* pix = noise_density_->ThresholdToPix(max_noise_count_);
+  if (debug) {
+    pixWrite("junknoisemask.png", pix, IFF_PNG);
+  }
+  ScrollView* win = NULL;
+  if (debug) {
+    win = MakeWindow(0, 400, "Photo Mask Blobs");
+  }
+  // Large and medium blobs are not text if they overlap with "a lot" of small
+  // blobs.
+  MarkAndDeleteNonTextBlobs(&blob_block->large_blobs,
+                            kMaxLargeOverlapsWithSmall,
+                            win, ScrollView::DARK_GREEN, pix);
+  MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall,
+                          win, ScrollView::WHITE, pix);
+  // Clear the grid of small blobs and insert the medium blobs.
+  Clear();
+  InsertBlobList(&blob_block->blobs);
+  MarkAndDeleteNonTextBlobs(&blob_block->large_blobs,
+                            kMaxLargeOverlapsWithMedium,
+                            win, ScrollView::DARK_GREEN, pix);
+  // Clear again before we start deleting the blobs in the grid.
+  Clear();
+  MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1,
+                            win, ScrollView::CORAL, pix);
+  MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1,
+                            win, ScrollView::GOLDENROD, pix);
+  MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1,
+                            win, ScrollView::WHITE, pix);
+  if (debug) {
+    win->Update();
+    pixWrite("junkccphotomask.png", pix, IFF_PNG);
+    delete win->AwaitEvent(SVET_DESTROY);
+    delete win;
+  }
+  return pix;
+}
+
+// Computes and returns the noise_density IntGrid, at the same gridsize as
+// this, by summing the number of small elements in a 3x3 neighbourhood of
+// each grid cell. good_grid is filled with blobs that are considered most
+// likely good text, and this is filled with small and medium blobs that are
+// more likely non-text.
+// The photo_map is used to bias the decision towards non-text, rather than
+// supplying a definite decision.
+IntGrid* CCNonTextDetect::ComputeNoiseDensity(bool debug, Pix* photo_map,
+                                              BlobGrid* good_grid) {
+  IntGrid* noise_counts = CountCellElements();
+  IntGrid* noise_density = noise_counts->NeighbourhoodSum();
+  IntGrid* good_counts = good_grid->CountCellElements();
+  // Now increase noise density in photo areas, to bias the decision and
+  // minimize hallucinated text on image, but trim the noise_density where
+  // there are good blobs and the original count is low in non-photo areas,
+  // indicating that most of the result came from neighbouring cells.
+  int height = pixGetHeight(photo_map);
+  int photo_offset = IntCastRounded(max_noise_count_ * kPhotoOffsetFraction);
+  for (int y = 0; y < gridheight(); ++y) {
+    for (int x = 0; x < gridwidth(); ++x) {
+      int noise = noise_density->GridCellValue(x, y);
+      if (max_noise_count_ < noise + photo_offset &&
+          noise <= max_noise_count_) {
+        // Within photo_offset below the threshold, so test for photo.
+        int left = x * gridsize();
+        int right = left + gridsize();
+        int bottom = height - y * gridsize();  // Flipped y: top < bottom here.
+        int top = bottom - gridsize();
+        if (ImageFind::BoundsWithinRect(photo_map, &left, &top, &right,
+                                        &bottom)) {
+          noise_density->SetGridCell(x, y, noise + photo_offset);
+        }
+      }
+      if (debug && noise > max_noise_count_ &&
+          good_counts->GridCellValue(x, y) > 0) {
+        tprintf("At %d, %d, noise = %d, good=%d, orig=%d, thr=%d\n",
+                x * gridsize(), y * gridsize(),
+                noise_density->GridCellValue(x, y),
+                good_counts->GridCellValue(x, y),
+                noise_counts->GridCellValue(x, y), max_noise_count_);
+      }
+      if (noise > max_noise_count_ &&
+          good_counts->GridCellValue(x, y) > 0 &&
+          noise_counts->GridCellValue(x, y) * kOriginalNoiseMultiple <=
+              max_noise_count_) {
+        noise_density->SetGridCell(x, y, 0);
+      }
+    }
+  }
+  delete noise_counts;
+  delete good_counts;
+  return noise_density;
+}
+
+// Helper to expand a box in one of the 4 directions by the given pad,
+// provided the expanded box does not touch any cell with a zero noise density.
+// The directions are tried in the order right, left, up, down, and the first
+// acceptable expansion is returned.
+// If none is possible, try expanding all round by the small constant
+// kNoisePadding, and if that fails too, return the box unchanged.
+static TBOX AttemptBoxExpansion(const TBOX& box, const IntGrid& noise_density,
+                                int pad) {
+  TBOX expanded_box(box);
+  expanded_box.set_right(box.right() + pad);
+  if (!noise_density.AnyZeroInRect(expanded_box))
+    return expanded_box;
+  expanded_box = box;
+  expanded_box.set_left(box.left() - pad);
+  if (!noise_density.AnyZeroInRect(expanded_box))
+    return expanded_box;
+  expanded_box = box;
+  expanded_box.set_top(box.top() + pad);
+  if (!noise_density.AnyZeroInRect(expanded_box))
+    return expanded_box;
+  expanded_box = box;
+  // Box y coordinates increase upwards (top() is above bottom()), so growing
+  // downwards means lowering the bottom edge, mirroring the left edge.
+  // Adding the pad here would shrink or even invert the box.
+  expanded_box.set_bottom(box.bottom() - pad);
+  if (!noise_density.AnyZeroInRect(expanded_box))
+    return expanded_box;
+  expanded_box = box;
+  expanded_box.pad(kNoisePadding, kNoisePadding);
+  if (!noise_density.AnyZeroInRect(expanded_box))
+    return expanded_box;
+  return box;
+}
+
+// Tests each blob in the list to see if it is certain non-text using 2
+// conditions:
+// 1. The blob's bounding box is mostly over the max_noise_count_ threshold in
+// noise_density_ (previously set by ComputeNoiseDensity).
+// OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This
+// condition is disabled with a negative max_blob_overlaps (e.g. -1).
+// If it does, the blob is declared non-text, and is used to mark up the
+// nontext_mask. Such blobs are fully deleted, and non-noise blobs have their
+// neighbours reset, as they may now point to deleted data.
+// WARNING: The blobs in the list may be in the *this grid, but they are
+// not removed from it. If any deleted blobs might be in *this, then this must
+// be Clear()ed immediately after MarkAndDeleteNonTextBlobs is called.
+// If the win is not NULL, deleted blobs are drawn on it in red, and kept
+// blobs are drawn on it in ok_color.
+void CCNonTextDetect::MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST* blobs,
+                                                int max_blob_overlaps,
+                                                ScrollView* win,
+                                                ScrollView::Color ok_color,
+                                                Pix* nontext_mask) {
+  // Vertical extent of the grid, used below to flip box.top() into a row of
+  // nontext_mask. A height is a difference of y coordinates; the previous
+  // code subtracted bleft().x(), which only gave the right answer when
+  // bleft() was the origin.
+  int imageheight = tright().y() - bleft().y();
+  BLOBNBOX_IT blob_it(blobs);
+  // Rejected blobs are parked here so that they are destroyed with the list
+  // when this function returns.
+  BLOBNBOX_LIST dead_blobs;
+  BLOBNBOX_IT dead_it(&dead_blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    TBOX box = blob->bounding_box();
+    if (!noise_density_->RectMostlyOverThreshold(box, max_noise_count_) &&
+        (max_blob_overlaps < 0 ||
+            !BlobOverlapsTooMuch(blob, max_blob_overlaps))) {
+      // Kept blob: drop its neighbour pointers, as they may refer to blobs
+      // that get deleted by this function.
+      blob->ClearNeighbours();
+      if (win != NULL)
+        blob->plot(win, ok_color, ok_color);
+    } else {
+      if (noise_density_->AnyZeroInRect(box)) {
+        // There is a danger that the bounding box may overlap real text, so
+        // we need to render the outline.
+        Pix* blob_pix = blob->cblob()->render_outline();
+        pixRasterop(nontext_mask, box.left(), imageheight - box.top(),
+                    box.width(), box.height(), PIX_SRC | PIX_DST,
+                    blob_pix, 0, 0);
+        pixDestroy(&blob_pix);
+      } else {
+        if (box.area() < gridsize() * gridsize()) {
+          // It is a really bad idea to make lots of small components in the
+          // photo mask, so try to join it to a bigger area by expanding the
+          // box in a way that does not touch any zero noise density cell.
+          box = AttemptBoxExpansion(box, *noise_density_, gridsize());
+        }
+        // All overlapped cells are non-zero, so just mark the rectangle.
+        pixRasterop(nontext_mask, box.left(), imageheight - box.top(),
+                    box.width(), box.height(), PIX_SET, NULL, 0, 0);
+      }
+      if (win != NULL)
+        blob->plot(win, ScrollView::RED, ScrollView::RED);
+      // It is safe to delete the cblob now, as it isn't used by the grid
+      // or BlobOverlapsTooMuch, and the BLOBNBOXes will go away with the
+      // dead_blobs list.
+      // TODO(rays) delete the delete when the BLOBNBOX destructor deletes
+      // the cblob.
+      delete blob->cblob();
+      dead_it.add_to_end(blob_it.extract());
+    }
+  }
+}
+
+// Returns true if the bounding box of the given blob has a major overlap
+// with more than max_overlaps blobs found in the current grid.
+bool CCNonTextDetect::BlobOverlapsTooMuch(BLOBNBOX* blob, int max_overlaps) {
+  // Run a rectangle search over this grid, covering the blob's bounding box,
+  // with each grid entry reported at most once.
+  BlobGridSearch rsearch(this);
+  TBOX box = blob->bounding_box();
+  rsearch.StartRectSearch(box);
+  rsearch.SetUniqueMode(true);
+  int overlap_count = 0;
+  for (;;) {
+    // The limit is tested before fetching, so nothing is fetched at all when
+    // max_overlaps is negative.
+    if (overlap_count > max_overlaps)
+      break;
+    BLOBNBOX* neighbour = rsearch.NextRectSearch();
+    if (neighbour == NULL)
+      break;
+    if (!box.major_overlap(neighbour->bounding_box()))
+      continue;
+    // Stop as soon as the limit is exceeded; the rest cannot change the
+    // answer.
+    if (++overlap_count > max_overlaps)
+      return true;
+  }
+  return false;
+}
+
+}  // namespace tesseract.
--- a/textord/ccnontextdetect.h
+++ b/textord/ccnontextdetect.h
@ -0,0 +1,87 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ccnontextdetect.h
+// Description: Connected-Component-based non-text detection.
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+// Created:     Sat Jun 11 09:52:01 PST 2011
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_TEXTORD_CCPHOTODETECT_H_
+#define TESSERACT_TEXTORD_CCPHOTODETECT_H_
+
+#include "blobgrid.h"
+#include "scrollview.h"
+
+namespace tesseract {
+
+// The CCNonTextDetect class contains grid-based operations on blobs to create
+// a full-resolution image mask analogous yet complementary to
+// pixGenHalftoneMask, as it is better at line-drawings, graphs and charts.
+class CCNonTextDetect : public BlobGrid {
+ public:
+  CCNonTextDetect(int gridsize, const ICOORD& bleft, const ICOORD& tright);
+  virtual ~CCNonTextDetect();
+
+  // Creates and returns a Pix with the same resolution as the original
+  // in which 1 (black) pixels represent likely non text (photo, line drawing)
+  // areas of the page, deleting from the blob_block the blobs that were
+  // determined to be non-text.
+  // The photo_map (binary image mask) is used to bias the decision towards
+  // non-text, rather than supplying a definite decision.
+  // The blob_block is the usual result of connected component analysis,
+  // holding the detected blobs.
+  // The returned Pix should be destroyed with pixDestroy after use.
+  Pix* ComputeNonTextMask(bool debug, Pix* photo_map, TO_BLOCK* blob_block);
+
+ private:
+  // Computes and returns the noise_density IntGrid, at the same gridsize as
+  // this by summing the number of small elements in a 3x3 neighbourhood of
+  // each grid cell. good_grid is filled with blobs that are considered most
+  // likely good text, and this is filled with small and medium blobs that are
+  // more likely non-text.
+  // The photo_map is used to bias the decision towards non-text, rather than
+  // supplying a definite decision.
+  IntGrid* ComputeNoiseDensity(bool debug, Pix* photo_map, BlobGrid* good_grid);
+
+  // Tests each blob in the list to see if it is certain non-text using 2
+  // conditions:
+  // 1. blob overlaps a cell with high value in noise_density_ (previously set
+  // by ComputeNoiseDensity).
+  // OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This
+  // condition is disabled with a negative max_blob_overlaps (e.g. -1).
+  // If it does, the blob is declared non-text, and is used to mark up the
+  // nontext_mask. Such blobs are fully deleted, and non-noise blobs have their
+  // neighbours reset, as they may now point to deleted data.
+  // WARNING: The blobs in the list may be in the *this grid, but they are
+  // not removed. If any deleted blobs might be in *this, then this must be
+  // Clear()ed immediately after MarkAndDeleteNonTextBlobs is called.
+  // If win is not NULL, deleted blobs are drawn in red, kept ones in ok_color.
+  void MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST* blobs,
+                                 int max_blob_overlaps,
+                                 ScrollView* win, ScrollView::Color ok_color,
+                                 Pix* nontext_mask);
+  // Returns true if the given blob overlaps more than max_overlaps blobs
+  // in the current grid.
+  bool BlobOverlapsTooMuch(BLOBNBOX* blob, int max_overlaps);
+
+  // Max entry in noise_density_ before the cell is declared noisy.
+  int max_noise_count_;
+  // Completed noise density map, which we keep around to use for secondary
+  // noise detection.
+  IntGrid* noise_density_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_TEXTORD_CCPHOTODETECT_H_
--- a/textord/cjkpitch.cpp
+++ b/textord/cjkpitch.cpp
--- a/textord/cjkpitch.h
+++ b/textord/cjkpitch.h
@ -0,0 +1,72 @@
+///////////////////////////////////////////////////////////////////////
+// File:        cjkpitch.h
+// Description: Code to determine fixed pitchness and the pitch if fixed,
+//              for CJK text.
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: takenaka@google.com (Hiroshi Takenaka)
+// Created:     Mon Jun 27 12:48:35 JST 2011
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+#ifndef CJKPITCH_H_
+#define CJKPITCH_H_
+
+#include          "blobbox.h"
+#include          "notdll.h"
+
+// Tests the "fixed-pitchness" of the input text and estimates the
+// character pitch parameters for it, based on a CJK fixed-pitch layout
+// model.
+//
+// This function assumes that a fixed-pitch CJK text has the following
+// characteristics:
+//
+// - Most glyphs are designed to fit within the same sized square
+//   (imaginary body). Also they are aligned to the center of their
+//   imaginary bodies.
+// - The imaginary body is always a regular rectangle.
+// - There may be some extra space between character bodies
+//   (tracking).
+// - There may be some extra space after punctuation.
+// - The text is *not* space-delimited. Thus spaces are rare.
+// - A character may consist of multiple unconnected blobs.
+//
+// The function works in two passes.  On pass 1, it looks for "good"
+// blobs that have the same pitch on both sides and look like a
+// complete CJK character. It then estimates the character pitch for
+// every row, based on those good blobs. If not enough good blobs are
+// found for a row, the pitch is estimated from other rows with
+// similar character height instead.
+//
+// Pass 2 is an iterative process to fit the blobs into fixed-pitch
+// character cells. Once we have estimated the character pitch, blobs
+// that are almost as large as the pitch can be considered to be
+// complete characters. And once we know that some characters are
+// complete characters, we can estimate the region occupied by their
+// neighbors. And so on.
+//
+// We repeat the process until all ambiguities are resolved. Then make
+// the final decision about fixed-pitchness of each row and compute
+// pitch and spacing parameters.
+//
+// (If a row is considered to be proportional, pitch_decision for the
+// row is set to PITCH_CORR_PROP and the later phase
+// (i.e. Textord::to_spacing()) should determine its spacing
+// parameters)
+//
+// This function doesn't provide all information required by
+// fixed_pitch_words() and the rows need to be processed with
+// make_prop_words() even if they are fixed pitched.
+void compute_fixed_pitch_cjk(ICOORD page_tr,               // top right
+                             TO_BLOCK_LIST *port_blocks);  // input list
+
+#endif  // CJKPITCH_H_
--- a/textord/colfind.cpp
+++ b/textord/colfind.cpp
--- a/textord/colfind.h
+++ b/textord/colfind.h
@ -25,25 +25,28 @@
 #include "colpartitiongrid.h"
 #include "colpartitionset.h"
 #include "ocrblock.h"
+#include "textlineprojection.h"

-class ScrollView;
-class TO_BLOCK;
-class STATS;
 class BLOCK_LIST;
 struct Boxa;
 struct Pixa;
+class DENORM;
+class ScrollView;
+class STATS;
+class TO_BLOCK;

 namespace tesseract {

 extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection");

-class StrokeWidth;
-class LineSpacing;
-class TempColumn_LIST;
-class ColSegment_LIST;
-class ColumnGroup_LIST;
 class ColPartitionSet;
 class ColPartitionSet_LIST;
+class ColSegment_LIST;
+class ColumnGroup_LIST;
+class LineSpacing;
+class StrokeWidth;
+class TempColumn_LIST;
+class EquationDetectBase;

 // The ColumnFinder class finds columns in the grid.
 class ColumnFinder : public TabFind {
@ -59,25 +62,54 @@ class ColumnFinder : public TabFind {
               int vertical_x, int vertical_y);
  virtual ~ColumnFinder();

+  // Accessors for testing
+  const DENORM* denorm() const {
+    return denorm_;
+  }
+  const TextlineProjection* projection() const {
+    return &projection_;
+  }
+
  // ======================================================================
  // The main function of ColumnFinder is broken into pieces to facilitate
  // optional insertion of orientation and script detection in an efficient
  // way. The calling sequence IS MANDATORY however, whether or not
  // OSD is being used:
  // 1. Construction.
-  // 2. IsVerticallyAlignedText.
-  // 3. CorrectOrientation.
-  // 4. FindBlocks.
-  // 5. Destruction. Use of a single column finder for multiple images does not
+  // 2. SetupAndFilterNoise.
+  // 3. IsVerticallyAlignedText.
+  // 4. CorrectOrientation.
+  // 5. FindBlocks.
+  // 6. Destruction. Use of a single column finder for multiple images does not
  //    make sense.
+  // Throughout these steps, the ColPartitions are owned by part_grid_, which
+  // means that it must be kept correct. Exception: big_parts_ owns its
+  // own ColPartitions.
+  // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except
+  // for a phase in FindBlocks before TransformToBlocks, when they become
+  // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX
+  // indicates more of a betrothal for the majority of layout analysis, ie
+  // which ColPartition will take ownership when the blobs are released from
+  // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that
+  // are part of the image regions, as they are not on any TO_BLOCK list.
  // TODO(rays) break up column finder further into smaller classes, as
  // there is a lot more to it than column finding now.
  // ======================================================================

-  // Tests for vertical alignment of text (returning true if so), and
-  // generates a list of blobs for orientation and script detection. Note that
-  // the vertical alignment may be due to text whose writing direction is
-  // vertical, like say Japanese, or due to text whose writing direction is
+  // Performs initial processing on the blobs in the input_block:
+  // Setup the part_grid, stroke_width_, nontext_map_.
+  // Obvious noise blobs are filtered out and used to mark the nontext_map_.
+  // Initial stroke-width analysis is used to get local text alignment
+  // direction, so the textline projection_ map can be setup.
+  // On return, IsVerticallyAlignedText may be called (now optionally) to
+  // determine the gross textline alignment of the page.
+  void SetupAndFilterNoise(Pix* photo_mask_pix, TO_BLOCK* input_block);
+
+  // Tests for vertical alignment of text (returning true if so), and generates
+  // a list of blobs (in osd_blobs) for orientation and script detection.
+  // block is the single block for the whole page or rectangle to be OCRed.
+  // Note that the vertical alignment may be due to text whose writing direction
+  // is vertical, like say Japanese, or due to text whose writing direction is
  // horizontal but whose text appears vertically aligned because the image is
  // not the right way up.
  bool IsVerticallyAlignedText(TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
@ -96,25 +128,32 @@ class ColumnFinder : public TabFind {
  void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines,
                          int recognition_rotation);

-  // Finds the text and image blocks, returning them in the blocks and to_blocks
-  // lists. (Each TO_BLOCK points to the basic BLOCK and adds more information.)
-  // If boxa and pixa are not NULL, they are assumed to be the output of
-  // ImageFinder::FindImages, and are used to generate image blocks.
-  // The input boxa and pixa are destroyed.
-  // Imageheight should be the pixel height of the original image.
-  // The input block is the result of a call to find_components, and contains
-  // the blobs found in the image. These blobs will be removed and placed
-  // in the output blocks, while unused ones will be deleted.
+  // Finds blocks of text, image, rule line, table etc, returning them in the
+  // blocks and to_blocks lists.
+  // (Each TO_BLOCK points to the basic BLOCK and adds more information.)
+  // Image blocks are generated by a combination of photo_mask_pix (which may
+  // NOT be NULL) and the rejected text found during preliminary textline
+  // finding.
+  // The input_block is the result of a call to find_components, and contains
+  // the blobs found in the image or rectangle to be OCRed. These blobs will be
+  // removed and placed in the output blocks, while unused ones will be deleted.
  // If single_column is true, the input is treated as single column, but
  // it is still divided into blocks of equal line spacing/text size.
-  // Returns -1 if the user requested retry with more debug info.
-  int FindBlocks(bool single_column, int imageheight,
-                 TO_BLOCK* block, Boxa* boxa, Pixa* pixa,
+  // scaled_color is scaled down by scaled_factor from the input color image,
+  // and may be NULL if the input was not color.
+  // Returns -1 if the user hits the 'd' key in the blocks window while running
+  // in debug mode, which requests a retry with more debug info.
+  int FindBlocks(bool single_column,
+                 Pix* scaled_color, int scaled_factor,
+                 TO_BLOCK* block, Pix* photo_mask_pix,
                 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);

  // Get the rotation required to deskew, and its inverse rotation.
  void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew);

+  // Set the equation detection pointer.
+  void SetEquationDetect(EquationDetectBase* detect);
+
 private:
  // Displays the blob and block bounding boxes in a window called Blocks.
  void DisplayBlocks(BLOCK_LIST* blocks);
@ -122,75 +161,11 @@ class ColumnFinder : public TabFind {
  // best_columns_.
  void DisplayColumnBounds(PartSetVector* sets);

-  // Converts the arrays of Box/Pix to a list of C_OUTLINE, and then to blobs.
-  // The output is a list of C_BLOBs for the images, but the C_OUTLINEs
-  // contain no data.
-  void ExtractImageBlobs(int image_height, Boxa* boxa, Pixa* pixa);
-
-  ////// Functions involved in making the initial ColPartitions. /////
-
-  // Creates the initial ColPartitions, and puts them in a ColPartitionSet
-  // for each grid y coordinate, storing the ColPartitionSets in part_sets_.
-  // After creating the ColPartitonSets, attempts to merge them where they
-  // overlap and unique the BLOBNBOXes within.
-  // The return value is the number of ColPartitionSets made.
-  int MakeColumnPartitions();
-  // Partition the BLOBNBOXES horizontally at the given grid y, creating a
-  // ColPartitionSet which is returned. NULL is returned if there are no
-  // BLOBNBOXES at the given grid y.
-  ColPartitionSet* PartitionsAtGridY(int grid_y);
-  // Insert the blobs in the given list into the main grid and for
-  // each one also make it a separate unknown partition.
-  // If filter is true, use only the blobs that are above a threshold in
-  // size or a non-isolated.
-  void InsertSmallBlobsAsUnknowns(bool filter, BLOBNBOX_LIST* blobs);
-  // Helper function for PartitionsAtGridY, with a long argument list.
-  // This bbox is of unknown type, so it is added to an unk_partition.
-  // If the edge is past the unk_right_margin then unk_partition has to be
-  // completed and a new one made. See CompletePartition and StartPartition
-  // for the other args.
-  void ProcessUnknownBlob(int page_edge, BLOBNBOX* bbox,
-                          ColPartition** unk_partition,
-                          ColPartition_IT* unk_part_it,
-                          TabVector** unk_right_line,
-                          int* unk_right_margin,
-                          int* unk_prev_margin,
-                          bool* unk_edge_is_left);
-  // Creates and returns a new ColPartition of the given start_type
-  // and adds the given bbox to it.
-  // Also finds the left and right tabvectors that bound the textline, setting
-  // the members of the returned ColPartition appropriately:
-  // If the left tabvector is less constraining than the input left_margin
-  // (assumed to be the right edge of the previous partition), then the
-  // tabvector is ignored and the left_margin used instead.
-  // If the right tabvector is more constraining than the input *right_margin,
-  // (probably the right edge of the page), then the *right_margin is adjusted
-  // to use the tabvector.
-  // *edge_is_left is set to true if the right tabvector is good and used as the
-  // margin, so we can include blobs that overhang the tabvector in this
-  // partition.
-  ColPartition* StartPartition(BlobRegionType start_type, int left_margin,
-                               BLOBNBOX* bbox, TabVector** right_line,
-                               int* right_margin, bool* edge_is_left);
-  // Completes the given partition, and adds it to the given iterator.
-  // The right_margin on input is the left edge of the next blob if there is
-  // one. The right tab vector plus a margin is used as the right margin if
-  // it is more constraining than the next blob, but if there are no more
-  // blobs, we want the right margin to make it to the page edge.
-  // The return value is the next left margin, being the right edge of the
-  // bounding box of blobs.
-  int CompletePartition(bool no_more_blobs, int page_edge,
-                        TabVector* right_line, int* right_margin,
-                        ColPartition** partition, ColPartition_IT* part_it);
-
-
  ////// Functions involved in determining the columns used on the page. /////

-  // Makes an ordered list of candidates to partition the width of the page
-  // into columns using the part_sets_.
-  // See AddToColumnSetsIfUnique for the ordering.
-  // If single_column, then it just makes a single page-wide fake column.
-  void MakeColumnCandidates(bool single_column);
+  // Sets up column_sets_ (the determined column layout at each horizontal
+  // slice). Returns false if the page is empty.
+  bool MakeColumns(bool single_column);
  // Attempt to improve the column_candidates by expanding the columns
  // and adding new partitions from the partition sets in src_sets.
  // Src_sets may be equal to column_candidates, in which case it will
@ -201,10 +176,10 @@ class ColumnFinder : public TabFind {
  void PrintColumnCandidates(const char* title);
  // Finds the optimal set of columns that cover the entire image with as
  // few changes in column partition as possible.
-  void AssignColumns();
+  void AssignColumns(const PartSetVector& part_sets);
  // Finds the biggest range in part_sets_ that has no assigned column, but
  // column assignment is possible.
-  bool BiggestUnassignedRange(const bool* any_columns_possible,
+  bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible,
                              int* start, int* end);
  // Finds the modal compatible column_set_ index within the given range.
  int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs,
@ -236,19 +211,21 @@ class ColumnFinder : public TabFind {
  //////// Functions that manipulate ColPartitions in the part_grid_ /////
  //////// to split, merge, find margins, and find types.  //////////////

-  // Removes the ColPartitions from part_sets_, the ColPartitionSets that
-  // contain them, and puts them in the part_grid_ after ensuring that no
-  // BLOBNBOX is owned by more than one of them.
-  void MovePartitionsToGrid();
+  // Hoovers up all un-owned blobs and deletes them.
+  // The rest get released from the block so the ColPartitions can pass
+  // ownership to the output blocks.
+  void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block);
  // Splits partitions that cross columns where they have nothing in the gap.
  void GridSplitPartitions();
  // Merges partitions where there is vertical overlap, within a single column,
  // and the horizontal gap is small enough.
  void GridMergePartitions();
-  // Resolves unknown partitions from the unknown_parts_ list by merging them
-  // with a close neighbour, inserting them into the grid with a known type,
-  // or declaring them to be noise.
-  void GridInsertUnknowns();
+  // Inserts remaining noise blobs into the most applicable partition if any.
+  // If there is no applicable partition, then the blobs are deleted.
+  void InsertRemainingNoise(TO_BLOCK* block);
+  // Remove partitions that come from horizontal lines that look like
+  // underlines, but are not part of a table.
+  void GridRemoveUnderlinePartitions();
  // Add horizontal line separators as partitions.
  void GridInsertHLinePartitions();
  // Add vertical line separators as partitions.
@ -272,22 +249,34 @@ class ColumnFinder : public TabFind {
  // Transform the grid of partitions to the output blocks.
  void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);

+  // Reflect the blob boxes (but not the outlines) in the y-axis so that
+  // the blocks get created in the correct RTL order. Rotates the blobs
+  // in the input_block and the bblobs list.
+  // The reflection is undone in RotateAndReskewBlocks by
+  // reflecting the blocks themselves, and then recomputing the blob bounding
+  // boxes.
+  void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs);
+
  // Undo the deskew that was done in FindTabVectors, as recognition is done
  // without correcting blobs or blob outlines for skew.
  // Reskew the completed blocks to put them back to the original rotated coords
  // that were created by CorrectOrientation.
+  // If the input_is_rtl, then reflect the blocks in the y-axis to undo the
+  // reflection that was done before FindTabVectors.
  // Blocks that were identified as vertical text (relative to the rotated
  // coordinates) are further rotated so the text lines are horizontal.
  // blob polygonal outlines are rotated to match the position of the blocks
  // that they are in, and their bounding boxes are recalculated to be accurate.
  // Record appropriate inverse transformations and required
  // classifier transformation in the blocks.
-  void RotateAndReskewBlocks(TO_BLOCK_LIST* to_blocks);
+  void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks);

-
-  // Move all the small and noise blobs into the main blobs list of
-  // the block from the to_blocks list that contains them.
-  void MoveSmallBlobs(BLOBNBOX_LIST* bblobs, TO_BLOCK_LIST* to_blocks);
+  // Computes the rotations for the block (to make textlines horizontal) and
+  // for the blobs (for classification) and sets the appropriate members
+  // of the given block.
+  // Returns the rotation that needs to be applied to the blobs to make
+  // them sit in the rotated block.
+  FCOORD ComputeBlockAndClassifyRotation(BLOCK* block);

  // The minimum gutter width to apply for finding columns.
  // Modified when vertical text is detected to prevent detection of
@ -305,9 +294,6 @@ class ColumnFinder : public TabFind {
  FCOORD rerotate_;
  // The additional rotation vector needed to rotate text for recognition.
  FCOORD text_rotation_;
-  // The part_sets_ are the initial text-line-like partition of the grid,
-  // and is a vector of ColPartitionSets.
-  PartSetVector part_sets_;
  // The column_sets_ contain the ordered candidate ColPartitionSets that
  // define the possible divisions of the page into columns.
  PartSetVector column_sets_;
@ -322,14 +308,31 @@ class ColumnFinder : public TabFind {
  // turned into regions, but are kept around because they are referenced
  // by the part_grid_.
  ColPartition_LIST good_parts_;
-  // List of ColPartitions of unknown type.
-  ColPartition_LIST unknown_parts_;
+  // List of ColPartitions that are big and might be dropcap or vertically
+  // joined.
+  ColPartition_LIST big_parts_;
  // List of ColPartitions that have been declared noise.
  ColPartition_LIST noise_parts_;
-  // The fake blobs that are made from the input boxa/pixa pair.
+  // The fake blobs that are made from the images.
  BLOBNBOX_LIST image_bblobs_;
  // Horizontal line separators.
  TabVector_LIST horizontal_lines_;
+  // Image map of photo/noise areas on the page.
+  Pix* nontext_map_;
+  // Textline projection map.
+  TextlineProjection projection_;
+  // Sequence of DENORMS that indicate how to get back to the original image
+  // coordinate space. The destructor must delete all the DENORMs in the chain.
+  DENORM* denorm_;
+
+  // Various debug windows that automatically go away on completion.
+  ScrollView* input_blobs_win_;
+
+  // The equation region detector pointer. Note: This pointer is passed in by
+  // member function SetEquationDetect and is NOT owned by this class, so
+  // releasing it is not this class's responsibility.
+  EquationDetectBase* equation_detect_;
+
  // Allow a subsequent instance to reuse the blocks window.
  // Not thread-safe, but multiple threads shouldn't be using windows anyway.
  static ScrollView* blocks_win_;
--- a/textord/colpartition.cpp
+++ b/textord/colpartition.cpp
--- a/textord/colpartition.h
+++ b/textord/colpartition.h
@ -90,7 +90,18 @@ class ColPartition : public ELIST2_LINK {
  // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
  // the ColPartition owns the BLOBNBOX!!!
  // Call DeleteBoxes before deleting the ColPartition.
-  static ColPartition* FakePartition(const TBOX& box);
+  static ColPartition* FakePartition(const TBOX& box,
+                                     PolyBlockType block_type,
+                                     BlobRegionType blob_type,
+                                     BlobTextFlowType flow);
+
+  // Constructs and returns a ColPartition with the given real BLOBNBOX,
+  // and sets it up to be a "big" partition (single-blob partition bigger
+  // than the surrounding text that may be a dropcap, two or more vertically
+  // touching characters, or some graphic element).
+  // If the given list is not NULL, the partition is also added to the list.
+  static ColPartition* MakeBigPartition(BLOBNBOX* box,
+                                        ColPartition_LIST* big_part_list);

  ~ColPartition();

@ -116,6 +127,12 @@ class ColPartition : public ELIST2_LINK {
  int median_bottom() const {
    return median_bottom_;
  }
+  int median_left() const {
+    return median_left_;
+  }
+  int median_right() const {
+    return median_right_;
+  }
  int median_size() const {
    return median_size_;
  }
@ -185,6 +202,12 @@ class ColPartition : public ELIST2_LINK {
  void set_working_set(WorkingPartSet* working_set) {
    working_set_ = working_set;
  }
+  bool block_owned() const {
+    return block_owned_;
+  }
+  void set_block_owned(bool owned) {
+    block_owned_ = owned;
+  }
  bool desperately_merged() const {
    return desperately_merged_;
  }
@ -342,33 +365,39 @@ class ColPartition : public ELIST2_LINK {
  bool HOverlaps(const ColPartition& other) const {
    return bounding_box_.x_overlap(other.bounding_box_);
  }
-  // Returns true if this and other can be combined without putting a
-  // horizontal step in either left or right edge.
-  bool HCompatible(const ColPartition& other) const {
-    return left_margin_ <= other.bounding_box_.left() &&
-           bounding_box_.left() >= other.left_margin_ &&
-           bounding_box_.right() <= other.right_margin_ &&
-           right_margin_ >= other.bounding_box_.right();
+  // Returns true if this and other's bounding boxes overlap vertically.
+  // TODO(rays) Make HOverlaps and VOverlaps truly symmetric.
+  bool VOverlaps(const ColPartition& other) const {
+    return bounding_box_.y_gap(other.bounding_box_) < 0;
  }
  // Returns the vertical overlap (by median) of this and other.
  // WARNING! Only makes sense on horizontal partitions!
-  int VOverlap(const ColPartition& other) const {
+  int VCoreOverlap(const ColPartition& other) const {
    return MIN(median_top_, other.median_top_) -
           MAX(median_bottom_, other.median_bottom_);
  }
  // Returns the horizontal overlap (by median) of this and other.
  // WARNING! Only makes sense on vertical partitions!
-  int HOverlap(const ColPartition& other) const {
+  int HCoreOverlap(const ColPartition& other) const {
    return MIN(median_right_, other.median_right_) -
           MAX(median_left_, other.median_left_);
  }
  // Returns true if this and other overlap significantly vertically.
-  bool VOverlaps(const ColPartition& other) const {
-    int overlap = VOverlap(other);
+  // WARNING! Only makes sense on horizontal partitions!
+  bool VSignificantCoreOverlap(const ColPartition& other) const {
+    int overlap = VCoreOverlap(other);
    int height = MIN(median_top_ - median_bottom_,
                     other.median_top_ - other.median_bottom_);
    return overlap * 3 > height;
  }
+  // Returns true if this and other can be combined without putting a
+  // horizontal step in either left or right edge of the resulting block.
+  bool WithinSameMargins(const ColPartition& other) const {
+    return left_margin_ <= other.bounding_box_.left() &&
+           bounding_box_.left() >= other.left_margin_ &&
+           bounding_box_.right() <= other.right_margin_ &&
+           right_margin_ >= other.bounding_box_.right();
+  }
  // Returns true if the region types (aligned_text_) match.
  // Lines never match anything, as they should never be merged or chained.
  bool TypesMatch(const ColPartition& other) const {
@ -379,6 +408,13 @@ class ColPartition : public ELIST2_LINK {
           !BLOBNBOX::IsLineType(type1) && !BLOBNBOX::IsLineType(type2);
  }

+  // Returns true if the types are similar to each other.
+  static bool TypesSimilar(PolyBlockType type1, PolyBlockType type2) {
+    return (type1 == type2 ||
+            (type1 == PT_FLOWING_TEXT && type2 == PT_INLINE_EQUATION) ||
+            (type2 == PT_FLOWING_TEXT && type1 == PT_INLINE_EQUATION));
+  }
+
  // Returns true if partitions is of horizontal line type
  bool IsLineType() const {
    return PTIsLineType(type_);
@ -430,8 +466,8 @@ class ColPartition : public ELIST2_LINK {
  TBOX BoundsWithoutBox(BLOBNBOX* box);

  // Claims the boxes in the boxes_list by marking them with a this owner
-  // pointer. If a box is already owned, then run Unique on it.
-  void ClaimBoxes(WidthCallback* cb);
+  // pointer.
+  void ClaimBoxes();

  // NULL the owner of the blobs in this partition, so they can be deleted
  // independently of the ColPartition.
@ -440,6 +476,12 @@ class ColPartition : public ELIST2_LINK {
  // Delete the boxes that this partition owns.
  void DeleteBoxes();

+  // Reflects the partition in the y-axis, assuming that its blobs have
+  // already been done. Corrects only a limited part of the members, since
+  // this function is assumed to be used shortly after initial creation, which
+  // is before a lot of the members are used.
+  void ReflectInYAxis();
+
  // Returns true if this is a legal partition - meaning that the conditions
  // left_margin <= bounding_box left
  // left_key <= bounding box left key
@ -451,6 +493,9 @@ class ColPartition : public ELIST2_LINK {
  // Returns true if the left and right edges are approximately equal.
  bool MatchingColumns(const ColPartition& other) const;

+  // Returns true if the colors match for two text partitions.
+  bool MatchingTextColor(const ColPartition& other) const;
+
  // Returns true if the sizes match for two text partitions,
  // taking orientation into account
  bool MatchingSizes(const ColPartition& other) const;
@ -482,6 +527,19 @@ class ColPartition : public ELIST2_LINK {
  // Returns the right rule line x coord of the rightmost blob.
  int RightBlobRule() const;

+  // Returns the density value for a particular BlobSpecialTextType.
+  float SpecialBlobsDensity(const BlobSpecialTextType type) const;
+  // Returns the number of blobs for a particular BlobSpecialTextType.
+  int SpecialBlobsCount(const BlobSpecialTextType type);
+  // Set the density value for a particular BlobSpecialTextType, should ONLY be
+  // used for debugging or testing. In production code, use
+  // ComputeSpecialBlobsDensity instead.
+  void SetSpecialBlobsDensity(
+      const BlobSpecialTextType type, const float density);
+  // Compute the SpecialTextType density of blobs, where we assume
+  // that the SpecialTextType in the boxes_ has been set.
+  void ComputeSpecialBlobsDensity();
+
  // Add a partner above if upper, otherwise below.
  // Add them uniquely and keep the list sorted by box left.
  // Partnerships are added symmetrically to partner and this.
@ -496,9 +554,6 @@ class ColPartition : public ELIST2_LINK {
  // Merge with the other partition and delete it.
  void Absorb(ColPartition* other, WidthCallback* cb);

-  // Shares out any common boxes amongst the partitions, ensuring that no
-  // box stays in both. Returns true if anything was done.
-  bool Unique(ColPartition* other, WidthCallback* cb);
  // Returns true if the overlap between this and the merged pair of
  // merge candidates is sufficiently trivial to be allowed.
  // The merged box can graze the edge of this by the ok_box_overlap
@ -551,10 +606,20 @@ class ColPartition : public ELIST2_LINK {
  // Leader detection is limited to sequences of identical width objects,
  // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
  bool MarkAsLeaderIfMonospaced();
+  // Given the result of TextlineProjection::EvaluateColPartition, (positive for
+  // horizontal text, negative for vertical text, and near zero for non-text),
+  // sets the blob_type_ and flow_ for this partition to indicate whether it
+  // is strongly or weakly vertical or horizontal text, or non-text.
+  void SetRegionAndFlowTypesFromProjectionValue(int value);

-  // Sets all blobs with the partition blob type and flow.
+  // Sets all blobs with the partition blob type and flow, but never overwrites
+  // leader blobs, as we need to be able to identify them later.
  void SetBlobTypes();

+  // Returns true if a decent baseline can be fitted through the blobs.
+  // Works for both horizontal and vertical text.
+  bool HasGoodBaseline();
+
  // Adds this ColPartition to a matching WorkingPartSet if one can be found,
  // otherwise starts a new one in the appropriate column, ending the previous.
  void AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
@ -579,6 +644,13 @@ class ColPartition : public ELIST2_LINK {
                             ColPartition_LIST* block_parts,
                             ColPartition_LIST* used_parts);

+  // Constructs a block from the given list of vertical text partitions.
+  // Currently only creates rectangular blocks.
+  static TO_BLOCK* MakeVerticalTextBlock(const ICOORD& bleft,
+                                         const ICOORD& tright,
+                                         ColPartition_LIST* block_parts,
+                                         ColPartition_LIST* used_parts);
+

  // Returns a copy of everything except the list of boxes. The resulting
  // ColPartition is only suitable for keeping in a column candidate list.
@ -769,6 +841,8 @@ class ColPartition : public ELIST2_LINK {
  ColPartition_CLIST lower_partners_;
  // The WorkingPartSet it lives in while blocks are being made.
  WorkingPartSet* working_set_;
+  // Flag is true when AddBox is sorting vertically, false otherwise.
+  bool last_add_was_vertical_;
  // True when the partition's ownership has been taken from the grid and
  // placed in a working set, or, after that, in the good_parts_ list.
  bool block_owned_;
@ -809,6 +883,8 @@ class ColPartition : public ELIST2_LINK {
  uinT8 color1_[kRGBRMSColors];
  uinT8 color2_[kRGBRMSColors];
  bool owns_blobs_;  // Does the partition own its blobs?
+  // The density of special blobs.
+  float special_blobs_densities_[BSTT_COUNT];
 };

 // Typedef it now in case it becomes a class later.
--- a/textord/colpartitiongrid.cpp
+++ b/textord/colpartitiongrid.cpp
--- a/textord/colpartitiongrid.h
+++ b/textord/colpartitiongrid.h
@ -22,6 +22,7 @@

 #include "bbgrid.h"
 #include "colpartition.h"
+#include "colpartitionset.h"

 namespace tesseract {

@ -36,11 +37,32 @@ class ColPartitionGrid : public BBGrid<ColPartition,
  ColPartitionGrid();
  ColPartitionGrid(int gridsize, const ICOORD& bleft, const ICOORD& tright);

-  ~ColPartitionGrid();
+  virtual ~ColPartitionGrid();

  // Handles a click event in a display window.
  void HandleClick(int x, int y);

+  // Merges ColPartitions in the grid that look like they belong in the same
+  // textline.
+  // For all partitions in the grid, calls the box_cb permanent callback
+  // to compute the search box, searches the box, and if a candidate is found,
+  // calls the confirm_cb to check any more rules. If the confirm_cb returns
+  // true, then the partitions are merged.
+  // Both callbacks are deleted before returning.
+  void Merges(TessResultCallback2<bool, ColPartition*, TBOX*>* box_cb,
+              TessResultCallback2<bool, const ColPartition*,
+                                  const ColPartition*>* confirm_cb);
+
+  // For the given partition, calls the box_cb permanent callback
+  // to compute the search box, searches the box, and if a candidate is found,
+  // calls the confirm_cb to check any more rules. If the confirm_cb returns
+  // true, then the partitions are merged.
+  // Returns true if the partition is consumed by one or more merges.
+  bool MergePart(TessResultCallback2<bool, ColPartition*, TBOX*>* box_cb,
+                 TessResultCallback2<bool, const ColPartition*,
+                                     const ColPartition*>* confirm_cb,
+                 ColPartition* part);
+
  // Finds all the ColPartitions in the grid that overlap with the given
  // box and returns them SortByBoxLeft(ed) and uniqued in the given list.
  // Any partition equal to not_this (may be NULL) is excluded.
@ -59,6 +81,68 @@ class ColPartitionGrid : public BBGrid<ColPartition,
                          const ColPartition*>* confirm_cb,
      int* overlap_increase);

+  // Split partitions where it reduces overlap between their bounding boxes.
+  // ColPartitions are after all supposed to be a partitioning of the blobs
+  // AND of the space on the page!
+  // Blobs that cause overlaps get removed, put in individual partitions
+  // and added to the big_parts list. They are most likely characters on
+  // 2 textlines that touch, or something big like a dropcap.
+  void SplitOverlappingPartitions(ColPartition_LIST* big_parts);
+
+  // Filters partitions of source_type by looking at local neighbours.
+  // Where a majority of neighbours have a text type, the partitions are
+  // changed to text, where the neighbours have image type, they are changed
+  // to image, and partitions that have no definite neighbourhood type are
+  // left unchanged.
+  // im_box and rerotation are used to map blob coordinates onto the
+  // nontext_map, which is used to prevent the spread of text neighbourhoods
+  // into images.
+  // Returns true if anything was changed.
+  bool GridSmoothNeighbours(BlobTextFlowType source_type, Pix* nontext_map,
+                            const TBOX& im_box, const FCOORD& rerotation);
+
+  // Compute the mean RGB of the light and dark pixels in each ColPartition
+  // and also the rms error in the linearity of color.
+  void ComputePartitionColors(Pix* scaled_color, int scaled_factor,
+                              const FCOORD& rerotation);
+
+  // Reflects the grid and its colpartitions in the y-axis, assuming that
+  // all blob boxes have already been done.
+  void ReflectInYAxis();
+
+  // Rotates the grid and its colpartitions by the given angle, assuming that
+  // all blob boxes have already been done.
+  void Deskew(const FCOORD& deskew);
+
+  // Sets the left and right tabs of the partitions in the grid.
+  void SetTabStops(TabFind* tabgrid);
+
+  // Makes the ColPartSets and puts them in the PartSetVector ready
+  // for finding column bounds. Returns false if no partitions were found.
+  // Each ColPartition in the grid is placed in a single ColPartSet based
+  // on the bottom-left of its bounding box.
+  bool MakeColPartSets(PartSetVector* part_sets);
+
+  // Makes a single ColPartitionSet consisting of a single ColPartition that
+  // represents the total horizontal extent of the significant content on the
+  // page. Used for the single column setting in place of automatic detection.
+  // Returns NULL if the page is empty of significant content.
+  ColPartitionSet* MakeSingleColumnSet(WidthCallback* cb);
+
+  // Mark the BLOBNBOXes in each partition as being owned by that partition.
+  void ClaimBoxes();
+
+  // Retypes all the blobs referenced by the partitions in the grid.
+  // Image blobs are sliced on the grid boundaries to give the tab finder
+  // a better handle on the edges of the images, and the actual blobs are
+  // returned in the im_blobs list, as they are not owned by the block.
+  void ReTypeBlobs(BLOBNBOX_LIST* im_blobs);
+
+  // The boxes within the partitions have changed (by deskew) so recompute
+  // the bounds of all the partitions and reinsert them into the grid.
+  void RecomputeBounds(int gridsize, const ICOORD& bleft,
+                       const ICOORD& tright, const ICOORD& vertical);
+
  // Improves the margins of the ColPartitions in the grid by calling
  // FindPartitionMargins on each.
  void GridFindMargins(ColPartitionSet** best_columns);
@ -68,6 +152,13 @@ class ColPartitionGrid : public BBGrid<ColPartition,
  void ListFindMargins(ColPartitionSet** best_columns,
                       ColPartition_LIST* parts);

+  // Deletes all the partitions in the grid after disowning all the blobs.
+  void DeleteParts();
+
+  // Deletes all the partitions in the grid that are of type BRT_UNKNOWN and
+  // all the blobs in them.
+  void DeleteUnknownParts(TO_BLOCK* block);
+
  // Finds and marks text partitions that represent figure captions.
  void FindFigureCaptions();

@ -78,12 +169,64 @@ class ColPartitionGrid : public BBGrid<ColPartition,
  // Finds the best partner in the given direction for the given partition.
  // Stores the result with AddPartner.
  void FindPartitionPartners(bool upper, ColPartition* part);
+  // Finds the best partner in the given direction for the given partition.
+  // Stores the result with AddPartner.
+  void FindVPartitionPartners(bool to_the_left, ColPartition* part);
  // For every ColPartition with multiple partners in the grid, reduces the
  // number of partners to 0 or 1. If get_desperate is true, goes to more
  // desperate merge methods to merge flowing text before breaking partnerships.
  void RefinePartitionPartners(bool get_desperate);

 private:
+  // Finds and returns a list of candidate ColPartitions to merge with part.
+  // The candidates must overlap search_box, and when merged must not
+  // overlap any other partitions that are not overlapped by each individually.
+  void FindMergeCandidates(const ColPartition* part, const TBOX& search_box,
+                           bool debug, ColPartition_CLIST* candidates);
+
+  // Smoothes the region type/flow type of the given part by looking at local
+  // neighbours and the given image mask. Searches a padded rectangle with the
+  // padding truncated on one side of the part's box in turn for each side,
+  // using the result (if any) that has the least distance to all neighbours
+  // that contribute to the decision. This biases in favor of rectangular
+  // regions without completely enforcing them.
+  // If a good decision cannot be reached, the part is left unchanged.
+  // im_box and rerotation are used to map blob coordinates onto the
+  // nontext_map, which is used to prevent the spread of text neighbourhoods
+  // into images.
+  // Returns true if the partition was changed.
+  bool SmoothRegionType(Pix* nontext_map,
+                        const TBOX& im_box,
+                        const FCOORD& rerotation,
+                        bool debug,
+                        ColPartition* part);
+  // Executes the search for SmoothRegionType in a single direction.
+  // Creates a bounding box that is padded in all directions except direction,
+  // and searches it for other partitions. Finds the nearest collection of
+  // partitions that makes a decisive result (if any) and returns the type
+  // and the distance of the collection. If there are any pixels in the
+  // nontext_map, then the decision is biased towards image.
+  BlobRegionType SmoothInOneDirection(BlobNeighbourDir direction,
+                                      Pix* nontext_map,
+                                      const TBOX& im_box,
+                                      const FCOORD& rerotation,
+                                      bool debug,
+                                      const ColPartition& part,
+                                      int* best_distance);
+  // Counts the partitions in the given search_box by appending the gap
+  // distance (scaled by dist_scaling) of the part from the base_part to the
+  // vector of the appropriate type for the partition. Prior to return, the
+  // vectors in the dists array are sorted in increasing order.
+  // dists must be an array of GenericVectors of size NPT_COUNT.
+  void AccumulatePartDistances(const ColPartition& base_part,
+                               const ICOORD& dist_scaling,
+                               const TBOX& search_box,
+                               Pix* nontext_map,
+                               const TBOX& im_box,
+                               const FCOORD& rerotation,
+                               bool debug,
+                               GenericVector<int>* dists);
+
  // Improves the margins of the ColPartition by searching for
  // neighbours that vertically overlap significantly.
  void FindPartitionMargins(ColPartitionSet* columns, ColPartition* part);
--- a/textord/colpartitionset.cpp
+++ b/textord/colpartitionset.cpp
@ -66,79 +66,13 @@ ColPartition* ColPartitionSet::ColumnContaining(int x, int y) {
  return NULL;
 }

-// Insert the ColPartitions in our list into the given grid.
-void ColPartitionSet::ReturnParts(ColPartition_LIST* parts) {
-  ColPartition_IT it(parts);
-  it.add_list_before(&parts_);
-}
-
-// Merge any significantly overlapping partitions within the this and other,
-// and unique the boxes so that no two partitions use the same box.
-// Return true if any changes were made to either set.
-bool ColPartitionSet::MergeOverlaps(ColPartitionSet* other, WidthCallback* cb) {
-  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
-                                         bounding_box_.bottom()) ||
-               TabFind::WithinTestRegion(2, other->bounding_box_.left(),
-                                         other->bounding_box_.bottom());
-  if (debug) {
-    tprintf("Considering merge on:\n");
-    Print();
-    other->Print();
+// Extract all the parts from the list, relinquishing ownership.
+void ColPartitionSet::RelinquishParts() {
+  ColPartition_IT it(&parts_);
+  while (!it.empty()) {
+    it.extract();
+    it.forward();
  }
-  ColPartition_IT it1(&parts_);
-  ColPartition_IT it2(&other->parts_);
-  bool any_merged = false;
-  it1.mark_cycle_pt();
-  it2.mark_cycle_pt();
-  // Iterate the two lists in parallel, using the fact that they are
-  // sorted by x-coord to keep the iterators in sync.
-  while (!it1.cycled_list() && !it2.cycled_list()) {
-    any_merged = false;
-    ColPartition* part1 = it1.data();
-    ColPartition* part2 = it2.data();
-    if (debug) {
-      tprintf("Vover=%d, HOver=%d, Hcompatible=%d, typesmatch=%d\n",
-              part1->VOverlaps(*part2), part1->HOverlaps(*part2),
-              part1->HCompatible(*part2), part1->TypesMatch(*part2));
-    }
-    if (part1->VOverlaps(*part2) &&
-        part1->HCompatible(*part2) && part1->TypesMatch(*part2)) {
-      // Partitions seem to be mergeable, so absorb part1 into part2.
-      part1->Absorb(it2.extract(), cb);
-      any_merged = true;
-      it1.forward();
-      it2.forward();
-    } else if (part1->HOverlaps(*part2) && part1->TypesMatch(*part2) &&
-               part1->Unique(part2, cb)) {
-      // Unique moved some boxes, so check to see in either partition was
-      // left empty. If not, any_merged is not set true.
-      if (part1->IsEmpty()) {
-        any_merged = true;
-        delete it1.extract();
-        it1.forward();
-        continue;
-      }
-      if (part2->IsEmpty()) {
-        any_merged = true;
-        delete it2.extract();
-        it2.forward();
-        continue;
-      }
-    }
-    if (!any_merged) {
-      // Move on the iterator that point to the leftmost partition.
-      if (part1->IsLeftOf(*part2)) {
-        it1.forward();
-      } else {
-        it2.forward();
-      }
-    }
-  }
-  if (any_merged) {
-    ComputeCoverage();
-    other->ComputeCoverage();
-  }
-  return any_merged;
 }

 // Attempt to improve this by adding partitions or expanding partitions.
@ -245,13 +179,13 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector* column_sets,
  }
  for (int i = 0; i < column_sets->size(); ++i) {
    ColPartitionSet* columns = column_sets->get(i);
-    // In ordering the column set candidates, total_coverage_ is king,
-    // followed by good_column_count_ and then total column_count.
-    bool better = total_coverage_ > columns->total_coverage_;
-    if (total_coverage_ == columns->total_coverage_) {
+    // In ordering the column set candidates, good_coverage_ is king,
+    // followed by good_column_count_ and then bad_coverage_.
+    bool better = good_coverage_ > columns->good_coverage_;
+    if (good_coverage_ == columns->good_coverage_) {
      better = good_column_count_ > columns->good_column_count_;
      if (good_column_count_ == columns->good_column_count_) {
-          better = parts_.length() > columns->parts_.length();
+          better = bad_coverage_ > columns->bad_coverage_;
      }
    }
    if (better) {
@ -278,7 +212,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector* column_sets,
 bool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet* other,
                                        WidthCallback* cb) {
  if (debug) {
-    tprintf("CompatibleColumns testing compability\n");
+    tprintf("CompatibleColumns testing compatibility\n");
    Print();
    other->Print();
  }
@ -295,7 +229,7 @@ bool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet* other,
        tprintf("CompatibleColumns ignoring image partition\n");
        part->Print();
      }
-      continue;  // Image partitions are irrelevant to column compability.
+      continue;  // Image partitions are irrelevant to column compatibility.
    }
    int y = part->MidY();
    int left = part->bounding_box().left();
@ -331,30 +265,15 @@ bool ColPartitionSet::CompatibleColumns(bool debug, ColPartitionSet* other,
      ColPartition* next_left_col = ColumnContaining(next_left, y);
      if (right_col == next_left_col) {
        // There is a column break in this column.
-        // Check for the difference between different column layout and
-        // a pull-out block.
-        int part_box_width = part->bounding_box().width();
-        int part_margin_width = part->right_margin() - part->left_margin();
-        int next_box_width = next_part->bounding_box().width();
-        int next_margin_width = next_part->right_margin() -
-                                next_part->left_margin();
-        int next_right = next_part->bounding_box().right();
-        if (part_box_width < next_margin_width &&
-            next_box_width < part_margin_width) {
+        // This can be due to a figure caption within a column, a pull-out
+        // block, or a simple broken textline that remains to be merged:
+        // all allowed, or a change in column layout: not allowed.
+        // If both partitions are of good width, then it is likely
+        // a change in column layout, otherwise probably an allowed situation.
+        if (part->good_width() && next_part->good_width()) {
          if (debug) {
-            tprintf("CompatibleColumns false due to equal sized columns\n");
-            tprintf("part1 %d-%d = %d, part2 %d-%d = %d\n",
-                    left, right, part->ColumnWidth(),
-                    next_left, next_right, next_part->ColumnWidth());
-            right_col->Print();
-          }
-          return false;  // Must be a new column layout as they are equal size.
-        }
-        ColPartition* next_right_col = ColumnContaining(next_right, y);
-        if (left_col == right_col && next_right_col == next_left_col) {
-          // Column completely contains both. Not allowed.
-          if (debug) {
-            tprintf("CompatibleColumns false due to containing 2 partitions\n");
+            int next_right = next_part->bounding_box().right();
+            tprintf("CompatibleColumns false due to 2 parts of good width\n");
            tprintf("part1 %d-%d, part2 %d-%d\n",
                    left, right, next_left, next_right);
            right_col->Print();
@ -654,8 +573,9 @@ void ColPartitionSet::AccumulateColumnWidthsAndGaps(int* total_width,
 // Provide debug output for this ColPartitionSet and all the ColPartitions.
 void ColPartitionSet::Print() {
  ColPartition_IT it(&parts_);
-  tprintf("Partition set of %d parts, %d good, coverage=%d (%d,%d)->(%d,%d)\n",
-          it.length(), good_column_count_, total_coverage_,
+  tprintf("Partition set of %d parts, %d good, coverage=%d+%d"
+          " (%d,%d)->(%d,%d)\n",
+          it.length(), good_column_count_, good_coverage_, bad_coverage_,
          bounding_box_.left(), bounding_box_.bottom(),
          bounding_box_.right(), bounding_box_.top());
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
@ -669,13 +589,7 @@ void ColPartitionSet::Print() {
 // Add the given partition to the list in the appropriate place.
 void ColPartitionSet::AddPartition(ColPartition* new_part,
                                   ColPartition_IT* it) {
-  bounding_box_ += new_part->bounding_box();
-  if (new_part->good_column() || new_part->good_width()) {
-    total_coverage_ += new_part->ColumnWidth();
-    ++good_column_count_;
-    if (new_part->good_width())
-      ++good_column_count_;
-  }
+  AddPartitionCoverageAndBox(*new_part);
  int new_right = new_part->right_key();
  if (it->data()->left_key() >= new_right)
    it->add_before_stay_put(new_part);
@ -683,22 +597,50 @@ void ColPartitionSet::AddPartition(ColPartition* new_part,
    it->add_after_stay_put(new_part);
 }

-// Compute the coverage and good column count.
+// Compute the coverage and good column count. Coverage is the amount of the
+// width of the page (in pixels) that is covered by ColPartitions, which are
+// used to provide candidate column layouts.
+// Coverage is split into good and bad. Good coverage is provided by
+// ColPartitions of a frequent width (according to the callback function
+// provided by TabFinder::WidthCB, which accesses stored statistics on the
+// widths of ColPartitions) and bad coverage is provided by all other
+// ColPartitions, even if they have tab vectors at both sides. Thus:
+// |-----------------------------------------------------------------|
+// |        Double     width    heading                              |
+// |-----------------------------------------------------------------|
+// |-------------------------------| |-------------------------------|
+// |   Common width ColPartition   | |  Common width ColPartition    |
+// |-------------------------------| |-------------------------------|
+// the layout with two common-width columns has better coverage than the
+// double width heading, because the coverage is "good," even though less in
+// total coverage than the heading, because the heading coverage is "bad."
 void ColPartitionSet::ComputeCoverage() {
  // Count the number of good columns and sum their width.
  ColPartition_IT it(&parts_);
  good_column_count_ = 0;
-  total_coverage_ = 0;
+  good_coverage_ = 0;
+  bad_coverage_ = 0;
  bounding_box_ = TBOX();
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    ColPartition* part = it.data();
-    bounding_box_ += part->bounding_box();
-    if (part->good_column() || part->good_width()) {
-      total_coverage_ += part->ColumnWidth();
+    AddPartitionCoverageAndBox(*part);
+  }
+}
+
+// Adds the coverage, column count and box for a single partition,
+// without adding it to the list. (Helper factored from ComputeCoverage.)
+void ColPartitionSet::AddPartitionCoverageAndBox(const ColPartition& part) {
+  bounding_box_ += part.bounding_box();
+  int coverage = part.ColumnWidth();
+  if (part.good_width()) {
+    good_coverage_ += coverage;
+    good_column_count_ += 2;
+  } else {
+    if (part.blob_type() < BRT_UNKNOWN)
+      coverage /= 2;
+    if (part.good_column())
      ++good_column_count_;
-      if (part->good_width())
-        ++good_column_count_;
-    }
+    bad_coverage_ += coverage;
  }
 }

--- a/textord/colpartitionset.h
+++ b/textord/colpartitionset.h
@ -66,13 +66,8 @@ class ColPartitionSet : public ELIST_LINK {
  // Return the bounding boxes of columns at the given y-range
  void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);

-  // Move the parts to the output list, giving up ownership.
-  void ReturnParts(ColPartition_LIST* parts);
-
-  // Merge any significantly overlapping partitions within the this and other,
-  // and unique the boxes so that no two partitions use the same box.
-  // Return true if any changes were made to either set.
-  bool MergeOverlaps(ColPartitionSet* other, WidthCallback* cb);
+  // Extract all the parts from the list, relinquishing ownership.
+  void RelinquishParts();

  // Attempt to improve this by adding partitions or expanding partitions.
  void ImproveColumnCandidate(WidthCallback* cb, PartSetVector* src_sets);
@ -133,15 +128,37 @@ class ColPartitionSet : public ELIST_LINK {
  // Add the given partition to the list in the appropriate place.
  void AddPartition(ColPartition* new_part, ColPartition_IT* it);

-  // Compute the coverage and good column count.
+  // Compute the coverage and good column count. Coverage is the amount of the
+  // width of the page (in pixels) that is covered by ColPartitions, which are
+  // used to provide candidate column layouts.
+  // Coverage is split into good and bad. Good coverage is provided by
+  // ColPartitions of a frequent width (according to the callback function
+  // provided by TabFinder::WidthCB, which accesses stored statistics on the
+  // widths of ColPartitions) and bad coverage is provided by all other
+  // ColPartitions, even if they have tab vectors at both sides. Thus:
+  // |-----------------------------------------------------------------|
+  // |        Double     width    heading                              |
+  // |-----------------------------------------------------------------|
+  // |-------------------------------| |-------------------------------|
+  // |   Common width ColPartition   | |  Common width ColPartition    |
+  // |-------------------------------| |-------------------------------|
+  // the layout with two common-width columns has better coverage than the
+  // double width heading: its coverage is "good," even though it is less in
+  // total than that of the heading, whose coverage is "bad."
  void ComputeCoverage();

+  // Adds the coverage, column count and box for a single partition,
+  // without adding it to the list. (Helper factored from ComputeCoverage.)
+  void AddPartitionCoverageAndBox(const ColPartition& part);
+
  // The partitions in this column candidate.
  ColPartition_LIST parts_;
  // The number of partitions that have a frequent column width.
  int good_column_count_;
-  // Total width of all the ColPartitions.
-  int total_coverage_;
+  // Total width of all the good ColPartitions.
+  int good_coverage_;
+  // Total width of all the bad ColPartitions.
+  int bad_coverage_;
  // Bounding box of all partitions in the set.
  TBOX bounding_box_;
 };
--- a/textord/devanagari_processing.cpp
+++ b/textord/devanagari_processing.cpp
@ -165,36 +165,6 @@ bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
  return true;
 }

-// This method changes the input page image and pix_binary to be the same as
-// the splitted image owned by this object.
-// Any of the parameters can be NULL.
-void ShiroRekhaSplitter::CopySplittedImageTo(IMAGE* page_image,
-                                             Pix** pix_binary) const {
-  ASSERT_HOST(splitted_image_);
-  if (pix_binary) {
-    pixDestroy(pix_binary);
-    *pix_binary = pixClone(splitted_image_);
-  }
-  if (page_image) {
-    page_image->FromPix(splitted_image_);
-  }
-}
-
-// This method changes the input page image and pix_binary to be the same as
-// the original image provided to this object.
-// Any of the parameters can be NULL.
-void ShiroRekhaSplitter::CopyOriginalImageTo(IMAGE* page_image,
-                                             Pix** pix_binary) const {
-  ASSERT_HOST(orig_pix_);
-  if (pix_binary) {
-    pixDestroy(pix_binary);
-    *pix_binary = pixClone(orig_pix_);
-  }
-  if (page_image) {
-    page_image->FromPix(orig_pix_);
-  }
-}
-
 // Method to perform a close operation on the input image. The xheight
 // estimate decides the size of sel used.
 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
@ -395,7 +365,8 @@ void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(
  C_BLOB_LIST not_found_blobs;
  RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
                               new_blobs,
-                               &not_found_blobs);
+                               ((devanagari_split_debugimage && debug_image_) ?
+                                &not_found_blobs : NULL));

  if (devanagari_split_debuglevel > 0) {
    tprintf("After refreshing blobs:\n");
@ -525,4 +496,4 @@ void PixelHistogram::ConstructHorizontalCountHist(Pix* pix) {
  numaDestroy(&counts);
 }

-}
+}  // namespace tesseract.
--- a/textord/devanagari_processing.h
+++ b/textord/devanagari_processing.h
@ -80,16 +80,6 @@ class ShiroRekhaSplitter {
  // splitting. If false, the ocr_split_strategy_ is used.
  bool Split(bool split_for_pageseg);

-  // This method changes the input page image and pix_binary to be the same as
-  // the splitted image owned by this object.
-  // Any of the parameters can be NULL.
-  void CopySplittedImageTo(IMAGE* page_image, Pix** pix_binary) const;
-
-  // This method changes the input page image and pix_binary to be the same as
-  // the original image provided to this object.
-  // Any of the parameters can be NULL.
-  void CopyOriginalImageTo(IMAGE* page_image, Pix** pix_binary) const;
-
  // Clears the memory held by this object.
  void Clear();

@ -212,5 +202,6 @@ class ShiroRekhaSplitter {
                        // performed before CCs are run through splitting.
 };

-}
+}  // namespace tesseract.
+
 #endif  // TESSERACT_TEXTORD_DEVNAGARI_PROCESSING_H_
--- a/textord/edgblob.cpp
+++ b/textord/edgblob.cpp
@ -419,12 +419,11 @@ void empty_buckets(                     // find blobs
    out_it.set_to_list(&outlines);
    do {
      parent_it = bucket_it;     // find outermost
-      do
-      bucket_it.forward();
-      while (!bucket_it.at_first()
-        && !(*parent_it.data() < *bucket_it.data()));
-    }
-    while (!bucket_it.at_first());
+      do {
+        bucket_it.forward();
+      } while (!bucket_it.at_first() &&
+               !(*parent_it.data() < *bucket_it.data()));
+    } while (!bucket_it.at_first());

                                 // move to new list
    out_it.add_after_then_move(parent_it.extract());
--- a/textord/imagefind.cpp
+++ b/textord/imagefind.cpp
--- a/textord/imagefind.h
+++ b/textord/imagefind.h
@ -21,21 +21,40 @@
 #ifndef TESSERACT_TEXTORD_IMAGEFIND_H__
 #define TESSERACT_TEXTORD_IMAGEFIND_H__

+#include "host.h"
+
 struct Boxa;
 struct Pix;
 struct Pixa;
+class TBOX;
+class FCOORD;
+class TO_BLOCK;
+class BLOBNBOX_LIST;

 namespace tesseract {

-// The ImageFinder class is a simple static function wrapper class that
+class ColPartitionGrid;
+class ColPartition_LIST;
+class TabFind;
+
+// The ImageFind class is a simple static function wrapper class that
 // exposes the FindImages function and some useful helper functions.
-class ImageFinder {
+class ImageFind {
 public:
-  // Finds image regions within the source pix (page image) and returns
-  // the image regions as a Boxa, Pixa pair, analgous to pixConnComp.
+  // Finds image regions within the BINARY source pix (page image) and returns
+  // the image regions as a mask image.
+  // The returned pix may be NULL, meaning no images found.
+  // If not NULL, it must be PixDestroyed by the caller.
+  static Pix* FindImages(Pix* pix);
+
+  // Generates a Boxa, Pixa pair from the input binary (image mask) pix,
+  // analogous to pixConnComp, except that connected components which are
+  // nearly rectangular are replaced with solid rectangles.
  // The returned boxa, pixa may be NULL, meaning no images found.
  // If not NULL, they must be destroyed by the caller.
-  static void FindImages(Pix* pix, Boxa** boxa, Pixa** pixa);
+  // Resolution of pix should match the source image (Tesseract::pix_binary_)
+  // so the output coordinate systems match.
+  static void ConnCompAndRectangularize(Pix* pix, Boxa** boxa, Pixa** pixa);

  // Returns true if there is a rectangle in the source pix, such that all
  // pixel rows and column slices outside of it have less than
@ -54,9 +73,84 @@ class ImageFinder {

  // Given an input pix, and a bounding rectangle, the sides of the rectangle
  // are shrunk inwards until they bound any black pixels found within the
-  // original rectangle.
-  static void BoundsWithinRect(Pix* pix, int* x_start, int* y_start,
+  // original rectangle. Returns false if the rectangle contains no black
+  // pixels at all.
+  static bool BoundsWithinRect(Pix* pix, int* x_start, int* y_start,
                               int* x_end, int* y_end);
+
+  // Given a point in 3-D (RGB) space, returns the squared Euclidean distance
+  // of the point from the given line, defined by a pair of points in the 3-D
+  // (RGB) space, line1 and line2.
+  static double ColorDistanceFromLine(const uinT8* line1, const uinT8* line2,
+                                      const uinT8* point);
+
+  // Returns the leptonica combined code for the given RGB triplet.
+  static uinT32 RGB(uinT32 r, uinT32 g, uinT32 b);
+
+  // Returns the input value clipped to a uinT8.
+  static uinT8 ClipToByte(double pixel);
+
+  // Computes the light and dark extremes of color in the given rectangle of
+  // the given pix, which is factor smaller than the coordinate system in rect.
+  // The light and dark points are taken to be the upper and lower 8th-ile of
+  // the most deviant of R, G and B. The value of the other 2 channels are
+  // computed by linear fit against the most deviant.
+  // The colors of the two points are returned in color1 and color2, with the
+  // alpha channel set to a scaled mean rms of the fits.
+  // If color_map1 is not null then it and color_map2 get rect pasted in them
+  // with the two calculated colors, and rms map gets a pasted rect of the rms.
+  // color_map1, color_map2 and rms_map are assumed to be the same scale as pix.
+  static void ComputeRectangleColors(const TBOX& rect, Pix* pix, int factor,
+                                     Pix* color_map1, Pix* color_map2,
+                                     Pix* rms_map,
+                                     uinT8* color1, uinT8* color2);
+
+  // Returns true if there are no black pixels in between the boxes.
+  // The im_box must represent the bounding box of the pix in tesseract
+  // coordinates, which may be negative, due to rotations to make the textlines
+  // horizontal. The boxes are rotated by rotation, which should undo such
+  // rotations, before mapping them onto the pix.
+  static bool BlankImageInBetween(const TBOX& box1, const TBOX& box2,
+                                  const TBOX& im_box, const FCOORD& rotation,
+                                  Pix* pix);
+
+  // Returns the number of pixels in box in the pix.
+  // The im_box must represent the bounding box of the pix in tesseract
+  // coordinates, which may be negative, due to rotations to make the textlines
+  // horizontal. The boxes are rotated by rotation, which should undo such
+  // rotations, before mapping them onto the pix.
+  static int CountPixelsInRotatedBox(TBOX box, const TBOX& im_box,
+                                     const FCOORD& rotation, Pix* pix);
+
+
+  // Locates all the image partitions in the part_grid, that were found by a
+  // previous call to FindImagePartitions, marks them in the image_mask,
+  // removes them from the grid, and deletes them. This makes it possible to
+  // call FindImagePartitions again to produce less broken-up and less
+  // overlapping image partitions.
+  // rerotation specifies how to rotate the partition coords to match
+  // the image_mask, since this function is used after orientation correction.
+  static void TransferImagePartsToImageMask(const FCOORD& rerotation,
+                                            ColPartitionGrid* part_grid,
+                                            Pix* image_mask);
+
+  // Runs a CC analysis on the image_pix mask image, and creates
+  // image partitions from them, cutting out strong text, and merging with
+  // nearby image regions such that they don't interfere with text.
+  // Rotation and rerotation specify how to rotate image coords to match
+  // the blob and partition coords and back again.
+  // The input/output part_grid owns all the created partitions, and
+  // the partitions own all the fake blobs that belong in the partitions.
+  // Since the other blobs in the other partitions will be owned by the block,
+  // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this
+  // situation and collect the image blobs.
+  static void FindImagePartitions(Pix* image_pix,
+                                  const FCOORD& rotation,
+                                  const FCOORD& rerotation,
+                                  TO_BLOCK* block,
+                                  TabFind* tab_grid,
+                                  ColPartitionGrid* part_grid,
+                                  ColPartition_LIST* big_parts);
 };

 }  // namespace tesseract.
--- a/textord/linefind.cpp
+++ b/textord/linefind.cpp
@ -34,129 +34,283 @@
 #endif
 #include "allheaders.h"

-BOOL_VAR(textord_tabfind_show_vlines, false, "Show vertical rule lines");
-
 namespace tesseract {

 /// Denominator of resolution makes max pixel width to allow thin lines.
-const int kThinLineFraction = 30;
+const int kThinLineFraction = 20;
 /// Denominator of resolution makes min pixels to demand line lengths to be.
-const int kMinLineLengthFraction = 8;
+const int kMinLineLengthFraction = 4;
 /// Spacing of cracks across the page to break up tall vertical lines.
 const int kCrackSpacing = 100;
 /// Grid size used by line finder. Not very critical.
 const int kLineFindGridSize = 50;
+// Min width of a line in pixels to be considered thick.
+const int kMinThickLineWidth = 12;
+// Max size of line residue. (The pixels that fail the long thin opening, and
+// therefore don't make it to the candidate line mask, but are nevertheless
+// part of the line.)
+const int kMaxLineResidue = 6;
+// Min length in inches of a line segment that exceeds kMinThickLineWidth in
+// thickness. (Such lines shouldn't break by simple image degradation.)
+const double kThickLengthMultiple = 0.75;
+// Max fraction of line box area that can be occupied by non-line pixels.
+const double kMaxNonLineDensity = 0.25;
+// Max height of a music stave in inches.
+const double kMaxStaveHeight = 1.0;
+// Minimum fraction of pixels in a music rectangle connected to the staves.
+const double kMinMusicPixelFraction = 0.75;

-// Finds vertical line objects in the given pix.
+// Erases the unused blobs (left_tab_type() == TT_MAYBE_ALIGNED) from line_pix,
+// converting each box to Leptonica coords according to horizontal_lines.
+static void RemoveUnusedLineSegments(bool horizontal_lines,
+                                     BLOBNBOX_LIST* line_bblobs,
+                                     Pix* line_pix) {
+  int height = pixGetHeight(line_pix);
+  BLOBNBOX_IT bbox_it(line_bblobs);
+  for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
+    BLOBNBOX* blob = bbox_it.data();
+    if (blob->left_tab_type() == TT_MAYBE_ALIGNED) {
+      const TBOX& box = blob->bounding_box();
+      Box* pixbox = NULL;
+      if (horizontal_lines) {
+        // Horizontal lines are in tess format and also have x and y flipped
+        // (to use FindVerticalAlignment) so we have to flip x and y and then
+        // convert to Leptonica by height - flipped x (ie the right edge).
+        // See GetLineBoxes for more explanation.
+        pixbox = boxCreate(box.bottom(), height - box.right(),
+                           box.height(), box.width());
+
+      } else {
+        // For vertical lines, just flip upside-down to convert to Leptonica.
+        // The y position of the box in Leptonica terms is the distance from
+        // the top of the image to the top of the box.
+        pixbox = boxCreate(box.left(), height - box.top(),
+                           box.width(), box.height());
+      }
+      pixClearInRect(line_pix, pixbox);
+      boxDestroy(&pixbox);
+    }
+  }
+}
+
+// Helper subtracts the line_pix image from the src_pix, and also removes
+// residue: components that touch the (dilated) line, but are not in the
+// non_line_pix mask, which is assumed to have already been prepared to the
+// required accuracy. src_pix is modified in place; resolution is unused.
+static void SubtractLinesAndResidue(Pix* line_pix, Pix* non_line_pix,
+                                    int resolution, Pix* src_pix) {
+  // First remove the lines themselves.
+  pixSubtract(src_pix, src_pix, line_pix);
+  // Subtract the non-lines from the image to get the residue.
+  Pix* residue_pix = pixSubtract(NULL, src_pix, non_line_pix);
+  // Dilate the lines so they touch the residue.
+  Pix* fat_line_pix = pixDilateBrick(NULL, line_pix, 3, 3);
+  // Seed fill the fat lines to get all the residue.
+  pixSeedfillBinary(fat_line_pix, fat_line_pix, residue_pix, 8);
+  // Subtract the residue from the original image.
+  pixSubtract(src_pix, src_pix, fat_line_pix);
+  pixDestroy(&fat_line_pix);
+  pixDestroy(&residue_pix);
+}
+
+// Returns the maximum strokewidth in the given binary image, taken as twice
+// the largest value found in its 8-bit distance function image.
+static int MaxStrokeWidth(Pix* pix) {
+  Pix* dist_pix = pixDistanceFunction(pix, 4, 8, L_BOUNDARY_BG);
+  int width = pixGetWidth(dist_pix);
+  int height = pixGetHeight(dist_pix);
+  int wpl = pixGetWpl(dist_pix);
+  l_uint32* data = pixGetData(dist_pix);
+  // Find the maximum value in the distance image, one byte per pixel.
+  int max_dist = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      int pixel = GET_DATA_BYTE(data, x);
+      if (pixel > max_dist)
+        max_dist = pixel;
+    }
+    data += wpl;  // Step to the next row (wpl words per line).
+  }
+  pixDestroy(&dist_pix);
+  return max_dist * 2;
+}
+
+// Returns the count of 8-connected components of intersection_pix in line_box.
+static int NumTouchingIntersections(Box* line_box, Pix* intersection_pix) {
+  if (intersection_pix == NULL) return 0;  // No mask: nothing can touch.
+  Pix* rect_pix = pixClipRectangle(intersection_pix, line_box, NULL);
+  Boxa* boxa = pixConnComp(rect_pix, NULL, 8);
+  pixDestroy(&rect_pix);
+  if (boxa == NULL) return 0;  // No component list: report zero.
+  int result = boxaGetCount(boxa);
+  boxaDestroy(&boxa);
+  return result;
+}
+
+// Returns the number of black pixels of nonline_pix found in the box made by
+// adding line_width to both sides of line_box (increasing the smallest
+// dimension of the bounding box, clipped to the image). Returns 0 on failure.
+static int CountPixelsAdjacentToLine(int line_width, Box* line_box,
+                                     Pix* nonline_pix) {
+  l_int32 x, y, box_width, box_height;
+  boxGetGeometry(line_box, &x, &y, &box_width, &box_height);
+  if (box_width > box_height) {
+    // Horizontal line: grow the box up and down, clipped to the image.
+    int bottom = MIN(pixGetHeight(nonline_pix), y + box_height + line_width);
+    y = MAX(0, y - line_width);
+    box_height = bottom - y;
+  } else {
+    // Vertical line: grow the box left and right, clipped to the image.
+    int right = MIN(pixGetWidth(nonline_pix), x + box_width + line_width);
+    x = MAX(0, x - line_width);
+    box_width = right - x;
+  }
+  Box* box = boxCreate(x, y, box_width, box_height);
+  Pix* rect_pix = pixClipRectangle(nonline_pix, box, NULL);
+  boxDestroy(&box);
+  l_int32 result = 0;  // Defined even if the clip or the count below fails.
+  pixCountPixels(rect_pix, &result, NULL);
+  pixDestroy(&rect_pix);
+  return result;
+}
+
+// Helper erases false-positive line segments from the input/output line_pix.
+// 1. Since thick lines shouldn't really break up, we can eliminate some false
+//    positives by marking segments that are at least kMinThickLineWidth
+//    thickness, yet have a length less than min_thick_length.
+// 2. Lines that don't have at least 2 intersections with other lines and have
+//    a lot of neighbouring non-lines are probably not lines (perhaps Arabic
+//    or Hindi words, or underlines.)
+// Bad line components are erased from line_pix.
+// Returns the number of remaining connected components.
+static int FilterFalsePositives(int resolution, Pix* nonline_pix,
+                                Pix* intersection_pix, Pix* line_pix) {
+  int min_thick_length = static_cast<int>(resolution * kThickLengthMultiple);
+  Pixa* pixa = NULL;
+  Boxa* boxa = pixConnComp(line_pix, &pixa, 8);
+  // Iterate over the boxes to remove false positives.
+  int nboxes = boxaGetCount(boxa);
+  int remaining_boxes = nboxes;
+  for (int i = 0; i < nboxes; ++i) {
+    Box* box = boxaGetBox(boxa, i, L_CLONE);
+    l_int32 x, y, box_width, box_height;
+    boxGetGeometry(box, &x, &y, &box_width, &box_height);
+    Pix* comp_pix = pixaGetPix(pixa, i, L_CLONE);
+    int max_width = MaxStrokeWidth(comp_pix);
+    pixDestroy(&comp_pix);
+    bool bad_line = false;
+    // Bad if both box sides are >= kMinThickLineWidth but < min_thick_length
+    // and the max stroke width also exceeds kMinThickLineWidth.
+    if (box_width >= kMinThickLineWidth && box_height >= kMinThickLineWidth &&
+        box_width < min_thick_length && box_height < min_thick_length &&
+        max_width > kMinThickLineWidth) {
+      // Too thick for the length.
+      bad_line = true;
+    }
+    if (!bad_line &&
+        (intersection_pix == NULL ||
+        NumTouchingIntersections(box, intersection_pix) < 2)) {
+      // Fewer than 2 intersections: test non-line density near the line.
+      int nonline_count = CountPixelsAdjacentToLine(max_width, box,
+                                                    nonline_pix);
+      if (nonline_count > box_height * box_width * kMaxNonLineDensity)
+        bad_line = true;
+    }
+    if (bad_line) {
+      // Not a good line: clear its whole bounding box in line_pix.
+      pixClearInRect(line_pix, box);
+      --remaining_boxes;
+    }
+    boxDestroy(&box);
+  }
+  pixaDestroy(&pixa);
+  boxaDestroy(&boxa);
+  return remaining_boxes;
+}
+
+// Finds vertical and horizontal line objects in the given pix.
 // Uses the given resolution to determine size thresholds instead of any
 // that may be present in the pix.
 // The output vertical_x and vertical_y contain a sum of the output vectors,
 // thereby giving the mean vertical direction.
+// If pix_music_mask != NULL, and music is detected, a mask of the staves
+// and anything that is connected (bars, notes etc.) will be returned in
+// pix_music_mask, the mask subtracted from pix, and the lines will not
+// appear in v_lines or h_lines.
 // The output vectors are owned by the list and Frozen (cannot refit) by
 // having no boxes, as there is no need to refit or merge separator lines.
-void LineFinder::FindVerticalLines(int resolution,  Pix* pix,
-                                   int* vertical_x, int* vertical_y,
-                                   TabVector_LIST* vectors) {
-  Pix* line_pix;
-  Boxa* boxes = GetVLineBoxes(resolution, pix, &line_pix);
-  C_BLOB_LIST line_cblobs;
-  int width = pixGetWidth(pix);
-  int height = pixGetHeight(pix);
-  ConvertBoxaToBlobs(width, height, &boxes, &line_cblobs);
-  // Make the BLOBNBOXes from the C_BLOBs.
-  BLOBNBOX_LIST line_bblobs;
-  C_BLOB_IT blob_it(&line_cblobs);
-  BLOBNBOX_IT bbox_it(&line_bblobs);
-  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
-    C_BLOB* cblob = blob_it.data();
-    BLOBNBOX* bblob = new BLOBNBOX(cblob);
-    bbox_it.add_to_end(bblob);
+// The detected lines are removed from the pix, which is modified in place.
+void LineFinder::FindAndRemoveLines(int resolution, bool debug, Pix* pix,
+                                    int* vertical_x, int* vertical_y,
+                                    Pix** pix_music_mask,
+                                    TabVector_LIST* v_lines,
+                                    TabVector_LIST* h_lines) {
+  if (pix == NULL || vertical_x == NULL || vertical_y == NULL) {
+    tprintf("Error in parameters for LineFinder::FindAndRemoveLines\n");
+    return;
  }
-  ICOORD bleft(0, 0);
-  ICOORD tright(width, height);
-  FindLineVectors(bleft, tright, &line_bblobs, vertical_x, vertical_y, vectors);
-  if (!vectors->empty()) {
-    // Some lines were found, so erase the unused blobs from the line image
-    // and then subtract the line image from the source.
-    bbox_it.move_to_first();
-    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
-      BLOBNBOX* blob = bbox_it.data();
-      if (blob->left_tab_type() == TT_UNCONFIRMED) {
-        const TBOX& box = blob->bounding_box();
-        Box* pixbox = boxCreate(box.left(), height - box.top(),
-                                box.width(), box.height());
-        pixClearInRect(line_pix, pixbox);
-        boxDestroy(&pixbox);
-      }
+  Pix* pix_vline = NULL;
+  Pix* pix_non_vline = NULL;
+  Pix* pix_hline = NULL;
+  Pix* pix_non_hline = NULL;
+  Pix* pix_intersections = NULL;
+  Pixa* pixa_display = debug ? pixaCreate(0) : NULL;
+  GetLineMasks(resolution, pix, &pix_vline, &pix_non_vline, &pix_hline,
+               &pix_non_hline, &pix_intersections, pix_music_mask,
+               pixa_display);
+  // Find lines, convert to TabVector_LIST and remove those that are used.
+  FindAndRemoveVLines(resolution, pix_intersections, vertical_x, vertical_y,
+                      &pix_vline, pix_non_vline, pix, v_lines);
+  if (pix_hline != NULL) {
+    // Recompute intersections (v-lines may have changed), re-filter h-lines.
+    if (pix_vline != NULL)
+      pixAnd(pix_intersections, pix_vline, pix_hline);
+    else
+      pixDestroy(&pix_intersections);
+    if (!FilterFalsePositives(resolution, pix_non_hline, pix_intersections,
+                              pix_hline)) {
+      pixDestroy(&pix_hline);  // No h-line components remain.
    }
-    pixDilateBrick(line_pix, line_pix, 1, 3);
-    pixSubtract(pix, pix, line_pix);
-    if (textord_tabfind_show_vlines)
-      pixWrite("vlinesclean.png", line_pix, IFF_PNG);
-    ICOORD vertical;
-    vertical.set_with_shrink(*vertical_x, *vertical_y);
-    TabVector::MergeSimilarTabVectors(vertical, vectors, NULL);
  }
-  pixDestroy(&line_pix);
-}
+  FindAndRemoveHLines(resolution, pix_intersections, *vertical_x, *vertical_y,
+                      &pix_hline, pix_non_hline, pix, h_lines);
+  if (pixa_display != NULL && pix_vline != NULL)
+    pixaAddPix(pixa_display, pix_vline, L_CLONE);
+  if (pixa_display != NULL && pix_hline != NULL)
+    pixaAddPix(pixa_display, pix_hline, L_CLONE);
+  if (pix_vline != NULL && pix_hline != NULL) {
+    // Remove joins (intersections) where lines cross, and the residue.
+    // Recalculate the intersections, since some lines have been deleted.
+    pixAnd(pix_intersections, pix_vline, pix_hline);
+    // Fatten up the intersections and seed-fill to get the intersection
+    // residue.
+    Pix* pix_join_residue = pixDilateBrick(NULL, pix_intersections, 5, 5);
+    pixSeedfillBinary(pix_join_residue, pix_join_residue, pix, 8);
+    // Now remove the intersection residue.
+    pixSubtract(pix, pix, pix_join_residue);
+    pixDestroy(&pix_join_residue);
+  }
+  // Remove any detected music.
+  if (pix_music_mask != NULL && *pix_music_mask != NULL) {
+    if (pixa_display != NULL)
+      pixaAddPix(pixa_display, *pix_music_mask, L_CLONE);
+    pixSubtract(pix, pix, *pix_music_mask);
+  }
+  if (pixa_display != NULL)
+    pixaAddPix(pixa_display, pix, L_CLONE);

-// Finds horizontal line objects in the given pix.
-// Uses the given resolution to determine size thresholds instead of any
-// that may be present in the pix.
-// The output vectors are owned by the list and Frozen (cannot refit) by
-// having no boxes, as there is no need to refit or merge separator lines.
-void LineFinder::FindHorizontalLines(int resolution,  Pix* pix,
-                                     TabVector_LIST* vectors) {
-  Pix* line_pix;
-  Boxa* boxes = GetHLineBoxes(resolution, pix, &line_pix);
-  C_BLOB_LIST line_cblobs;
-  int width = pixGetWidth(pix);
-  int height = pixGetHeight(pix);
-  ConvertBoxaToBlobs(height, width, &boxes, &line_cblobs);
-  // Make the BLOBNBOXes from the C_BLOBs.
-  BLOBNBOX_LIST line_bblobs;
-  C_BLOB_IT blob_it(&line_cblobs);
-  BLOBNBOX_IT bbox_it(&line_bblobs);
-  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
-    C_BLOB* cblob = blob_it.data();
-    BLOBNBOX* bblob = new BLOBNBOX(cblob);
-    bbox_it.add_to_end(bblob);
+  pixDestroy(&pix_vline);
+  pixDestroy(&pix_non_vline);
+  pixDestroy(&pix_hline);
+  pixDestroy(&pix_non_hline);
+  pixDestroy(&pix_intersections);
+  if (pixa_display != NULL) {
+#if LIBLEPT_MINOR_VERSION >= 69 || LIBLEPT_MAJOR_VERSION > 1
+    pixaConvertToPdf(pixa_display, resolution, 1.0f, 0, 0, "LineFinding",
+                     "vhlinefinding.pdf");
+#endif
+    pixaDestroy(&pixa_display);
  }
-  ICOORD bleft(0, 0);
-  ICOORD tright(height, width);
-  int vertical_x, vertical_y;
-  FindLineVectors(bleft, tright, &line_bblobs, &vertical_x, &vertical_y,
-                  vectors);
-  if (!vectors->empty()) {
-    // Some lines were found, so erase the unused blobs from the line image
-    // and then subtract the line image from the source.
-    bbox_it.move_to_first();
-    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
-      BLOBNBOX* blob = bbox_it.data();
-      if (blob->left_tab_type() == TT_UNCONFIRMED) {
-        const TBOX& box = blob->bounding_box();
-        // Coords are in tess format so filp x and y and then covert
-        // to leptonica by height -y.
-        Box* pixbox = boxCreate(box.bottom(), height - box.right(),
-                                box.height(), box.width());
-        pixClearInRect(line_pix, pixbox);
-        boxDestroy(&pixbox);
-      }
-    }
-    pixDilateBrick(line_pix, line_pix, 3, 1);
-    pixSubtract(pix, pix, line_pix);
-    if (textord_tabfind_show_vlines)
-      pixWrite("hlinesclean.png", line_pix, IFF_PNG);
-    ICOORD vertical;
-    vertical.set_with_shrink(vertical_x, vertical_y);
-    TabVector::MergeSimilarTabVectors(vertical, vectors, NULL);
-    // Iterate the vectors to flip them.
-    TabVector_IT h_it(vectors);
-    for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {
-      h_it.data()->XYFlip();
-    }
-  }
-  pixDestroy(&line_pix);
 }

 // Converts the Boxa array to a list of C_BLOB, getting rid of severely
@ -176,8 +330,8 @@ void LineFinder::ConvertBoxaToBlobs(int image_width, int image_height,
    // Make a C_OUTLINE from the leptonica box. This is a bit of a hack,
    // as there is no outline, just a bounding box, but with some very
    // small changes to coutln.cpp, it works nicely.
-    ICOORD top_left(x, image_height - y);
-    ICOORD bot_right(x + width, image_height - (y + height));
+    ICOORD top_left(x, y);
+    ICOORD bot_right(x + width, y + height);
    CRACKEDGE startpt;
    startpt.pos = top_left;
    C_OUTLINE* outline = new C_OUTLINE(&startpt, top_left, bot_right, 0);
@ -197,6 +351,85 @@ void LineFinder::ConvertBoxaToBlobs(int image_width, int image_height,
  boxaDestroy(boxes);
 }

+// Finds vertical line objects in pix_vline and removes them from src_pix.
+// Uses the given resolution to determine size thresholds instead of any
+// that may be present in the pix.
+// The output vertical_x and vertical_y contain a sum of the output vectors,
+// thereby giving the mean vertical direction.
+// The output vectors are owned by the list and Frozen (cannot refit) by
+// having no boxes, as there is no need to refit or merge separator lines.
+// If no good lines are found, pix_vline is destroyed.
+// None of the input pointers may be NULL, and if *pix_vline is NULL then
+// the function does nothing.
+void LineFinder::FindAndRemoveVLines(int resolution,
+                                     Pix* pix_intersections,
+                                     int* vertical_x, int* vertical_y,
+                                     Pix** pix_vline, Pix* pix_non_vline,
+                                     Pix* src_pix, TabVector_LIST* vectors) {
+  if (pix_vline == NULL || *pix_vline == NULL) return;
+  C_BLOB_LIST line_cblobs;
+  BLOBNBOX_LIST line_bblobs;
+  GetLineBoxes(false, *pix_vline, pix_intersections,
+               &line_cblobs, &line_bblobs);
+  int width = pixGetWidth(src_pix);
+  int height = pixGetHeight(src_pix);
+  ICOORD bleft(0, 0);
+  ICOORD tright(width, height);
+  FindLineVectors(bleft, tright, &line_bblobs, vertical_x, vertical_y, vectors);
+  if (!vectors->empty()) {
+    RemoveUnusedLineSegments(false, &line_bblobs, *pix_vline);
+    SubtractLinesAndResidue(*pix_vline, pix_non_vline, resolution, src_pix);
+    ICOORD vertical;
+    vertical.set_with_shrink(*vertical_x, *vertical_y);
+    TabVector::MergeSimilarTabVectors(vertical, vectors, NULL);
+  } else {
+    pixDestroy(pix_vline);
+  }
+}
+
+// Finds horizontal line objects in pix_hline and removes them from src_pix.
+// Uses the given resolution to determine size thresholds instead of any
+// that may be present in the pix.
+// vertical_x and vertical_y are passed by value, so nothing is returned in
+// them: locally they hold the vector sum (mean vertical direction).
+// The output vectors are owned by the list and Frozen (cannot refit) by
+// having no boxes, as there is no need to refit or merge separator lines.
+// If no good lines are found, pix_hline is destroyed.
+// None of the input pointers may be NULL, and if *pix_hline is NULL then
+// the function does nothing.
+void LineFinder::FindAndRemoveHLines(int resolution,
+                                     Pix* pix_intersections,
+                                     int vertical_x, int vertical_y,
+                                     Pix** pix_hline, Pix* pix_non_hline,
+                                     Pix* src_pix, TabVector_LIST* vectors) {
+  if (pix_hline == NULL || *pix_hline == NULL) return;
+  C_BLOB_LIST line_cblobs;
+  BLOBNBOX_LIST line_bblobs;
+  GetLineBoxes(true, *pix_hline, pix_intersections, &line_cblobs, &line_bblobs);
+  int width = pixGetWidth(src_pix);
+  int height = pixGetHeight(src_pix);
+  ICOORD bleft(0, 0);
+  ICOORD tright(height, width);
+  FindLineVectors(bleft, tright, &line_bblobs, &vertical_x, &vertical_y,
+                  vectors);
+  if (!vectors->empty()) {
+    RemoveUnusedLineSegments(true, &line_bblobs, *pix_hline);
+    SubtractLinesAndResidue(*pix_hline, pix_non_hline, resolution, src_pix);
+    ICOORD vertical;
+    vertical.set_with_shrink(vertical_x, vertical_y);
+    TabVector::MergeSimilarTabVectors(vertical, vectors, NULL);
+    // Iterate the vectors to flip them. x and y were flipped for horizontal
+    // lines, so FindLineVectors can work just with the vertical case.
+    // See GetLineBoxes for more on the flip.
+    TabVector_IT h_it(vectors);
+    for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {
+      h_it.data()->XYFlip();
+    }
+  } else {
+    pixDestroy(pix_hline);
+  }
+}
+
 // Finds vertical lines in the given list of BLOBNBOXes. bleft and tright
 // are the bounds of the image on which the input line_bblobs were found.
 // The input line_bblobs list is const really.
@ -213,7 +446,7 @@ void LineFinder::FindLineVectors(const ICOORD& bleft, const ICOORD& tright,
  AlignedBlob blob_grid(kLineFindGridSize, bleft, tright);
  for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
    BLOBNBOX* bblob = bbox_it.data();
-    bblob->set_left_tab_type(TT_UNCONFIRMED);
+    bblob->set_left_tab_type(TT_MAYBE_ALIGNED);
    bblob->set_left_rule(bleft.x());
    bblob->set_right_rule(tright.x());
    bblob->set_left_crossing_rule(bleft.x());
@ -221,20 +454,18 @@ void LineFinder::FindLineVectors(const ICOORD& bleft, const ICOORD& tright,
    blob_grid.InsertBBox(false, true, bblob);
    ++b_count;
  }
-  if (textord_debug_tabfind)
-    tprintf("Inserted %d line blobs into grid\n", b_count);
  if (b_count == 0)
    return;

  // Search the entire grid, looking for vertical line vectors.
-  GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> lsearch(&blob_grid);
+  BlobGridSearch lsearch(&blob_grid);
  BLOBNBOX* bbox;
  TabVector_IT vector_it(vectors);
  *vertical_x = 0;
  *vertical_y = 1;
  lsearch.StartFullSearch();
  while ((bbox = lsearch.NextFullSearch()) != NULL) {
-    if (bbox->left_tab_type() == TT_UNCONFIRMED) {
+    if (bbox->left_tab_type() == TT_MAYBE_ALIGNED) {
      const TBOX& box = bbox->bounding_box();
      if (AlignedBlob::WithinTestRegion(2, box.left(), box.bottom()))
        tprintf("Finding line vector starting at bbox (%d,%d)\n",
@ -249,89 +480,268 @@ void LineFinder::FindLineVectors(const ICOORD& bleft, const ICOORD& tright,
      }
    }
  }
-  ScrollView* line_win = NULL;
-  if (textord_tabfind_show_vlines) {
-    line_win = blob_grid.MakeWindow(0, 50, "Vlines");
-    blob_grid.DisplayBoxes(line_win);
-    line_win = blob_grid.DisplayTabs("Vlines", line_win);
-  }
 }

-// Get a set of bounding boxes of possible vertical lines in the image.
-// The input resolution overrides any resolution set in src_pix.
-// The output line_pix contains just all the detected lines.
-Boxa* LineFinder::GetVLineBoxes(int resolution, Pix* src_pix, Pix** line_pix) {
-  // Remove any parts of 1 inch/kThinLineFraction wide or more, by opening
-  // away the thin lines and subtracting what's left.
-  // This is very generous and will leave in even quite wide lines.
-  Pix* pixt1 = pixOpenBrick(NULL, src_pix, resolution / kThinLineFraction, 1);
-  pixSubtract(pixt1, src_pix, pixt1);
-  // Spread sideways to allow for some skew.
-  Pix* pixt2 = pixDilateBrick(NULL, pixt1, 3, 1);
-  // Now keep only tall stuff of height at least 1 inch/kMinLineLengthFraction.
-  pixOpenBrick(pixt1, pixt2, 1, resolution / kMinLineLengthFraction);
-  pixDestroy(&pixt2);
-  // Put a single pixel crack in every line at an arbitrary spacing,
-  // so they break up and the bounding boxes can be used to get the
-  // direction accurately enough without needing outlines.
-  int wpl = pixGetWpl(pixt1);
-  int height = pixGetHeight(pixt1);
-  l_uint32* data = pixGetData(pixt1);
-  for (int y = kCrackSpacing; y < height; y += kCrackSpacing) {
-    memset(data + wpl * y, 0, wpl * sizeof(*data));
+// Returns a Pix music mask if music is detected.
+// Any vertical line that has at least 5 intersections in sufficient density
+// is taken to be a bar. Bars are used as a seed and the entire touching
+// component is added to the output music mask and subtracted from the lines.
+// Returns NULL and does minimal work if no music is found.
+static Pix* FilterMusic(int resolution, Pix* pix_closed,
+                        Pix* pix_vline, Pix* pix_hline,
+                        l_int32* v_empty, l_int32* h_empty) {
+  int max_stave_height = static_cast<int>(resolution * kMaxStaveHeight);
+  Pix* intersection_pix = pixAnd(NULL, pix_vline, pix_hline);
+  Boxa* boxa = pixConnComp(pix_vline, NULL, 8);
+  // Iterate over the boxes to find music bars.
+  int nboxes = boxaGetCount(boxa);
+  Pix* music_mask = NULL;
+  for (int i = 0; i < nboxes; ++i) {
+    Box* box = boxaGetBox(boxa, i, L_CLONE);
+    l_int32 x, y, box_width, box_height;
+    boxGetGeometry(box, &x, &y, &box_width, &box_height);
+    int joins = NumTouchingIntersections(box, intersection_pix);
+    // Test for the join density being at least 5 per max_stave_height,
+    // ie (joins-1)/box_height >= (5-1)/max_stave_height.
+    if (joins >= 5 && (joins - 1) * max_stave_height >= 4 * box_height) {
+      // This is a music bar. Add to the mask.
+      if (music_mask == NULL)
+        music_mask = pixCreate(pixGetWidth(pix_vline), pixGetHeight(pix_vline),
+                               1);
+      pixSetInRect(music_mask, box);
+    }
+    boxDestroy(&box);
  }
-  if (textord_tabfind_show_vlines)
-    pixWrite("vlines.png", pixt1, IFF_PNG);
-  Boxa* boxa = pixConnComp(pixt1, NULL, 8);
-  *line_pix = pixt1;
-  return boxa;
-}
-
-// Get a set of bounding boxes of possible horizontal lines in the image.
-// The input resolution overrides any resolution set in src_pix.
-// The output line_pix contains just all the detected lines.
-// The output boxes undergo the transformation (x,y)->(height-y,x) so the
-// lines can be found with a vertical line finder afterwards.
-// This transformation allows a simple x/y flip to reverse it in tesseract
-// coordinates and it is faster to flip the lines than rotate the image.
-Boxa* LineFinder::GetHLineBoxes(int resolution, Pix* src_pix, Pix** line_pix) {
-  // Remove any parts of 1 inch/kThinLineFraction high or more, by opening
-  // away the thin lines and subtracting what's left.
-  // This is very generous and will leave in even quite wide lines.
-  Pix* pixt1 = pixOpenBrick(NULL, src_pix, 1, resolution / kThinLineFraction);
-  pixSubtract(pixt1, src_pix, pixt1);
-  // Spread vertically to allow for some skew.
-  Pix* pixt2 = pixDilateBrick(NULL, pixt1, 1, 3);
-  // Now keep only wide stuff of width at least 1 inch/kMinLineLengthFraction.
-  pixOpenBrick(pixt1, pixt2, resolution / kMinLineLengthFraction, 1);
-  pixDestroy(&pixt2);
-  // Put a single pixel crack in every line at an arbitrary spacing,
-  // so they break up and the bounding boxes can be used to get the
-  // direction accurately enough without needing outlines.
-  int wpl = pixGetWpl(pixt1);
-  int width = pixGetWidth(pixt1);
-  int height = pixGetHeight(pixt1);
-  l_uint32* data = pixGetData(pixt1);
-  for (int y = 0; y < height; ++y, data += wpl) {
-    for (int x = kCrackSpacing; x < width; x += kCrackSpacing) {
-      CLEAR_DATA_BIT(data, x);
+  boxaDestroy(&boxa);
+  pixDestroy(&intersection_pix);
+  if (music_mask != NULL) {
+    // The mask currently contains just the bars. Use the mask as a seed
+    // and the pix_closed as the mask for a seedfill to get all the
+    // intersecting staves.
+    pixSeedfillBinary(music_mask, music_mask, pix_closed, 8);
+    // Filter out false positives. CCs in the music_mask should be the vast
+    // majority of the pixels in their bounding boxes, as we expect just a
+    // tiny amount of text, a few phrase marks, and crescendo etc left.
+    Boxa* boxa = pixConnComp(music_mask, NULL, 8);
+    // Iterate over the boxes to find music components.
+    int nboxes = boxaGetCount(boxa);
+    for (int i = 0; i < nboxes; ++i) {
+      Box* box = boxaGetBox(boxa, i, L_CLONE);
+      Pix* rect_pix = pixClipRectangle(music_mask, box, NULL);
+      l_int32 music_pixels;
+      pixCountPixels(rect_pix, &music_pixels, NULL);
+      pixDestroy(&rect_pix);
+      rect_pix = pixClipRectangle(pix_closed, box, NULL);
+      l_int32 all_pixels;
+      pixCountPixels(rect_pix, &all_pixels, NULL);
+      pixDestroy(&rect_pix);
+      if (music_pixels < kMinMusicPixelFraction * all_pixels) {
+        // False positive. Delete from the music mask.
+        pixClearInRect(music_mask, box);
+      }
+      boxDestroy(&box);
+    }
+    l_int32 no_remaining_music;
+    boxaDestroy(&boxa);
+    pixZero(music_mask, &no_remaining_music);
+    if (no_remaining_music) {
+      pixDestroy(&music_mask);
+    } else {
+      pixSubtract(pix_vline, pix_vline, music_mask);
+      pixSubtract(pix_hline, pix_hline, music_mask);
+      // We may have deleted all the lines
+      pixZero(pix_vline, v_empty);
+      pixZero(pix_hline, h_empty);
    }
  }
-  if (textord_tabfind_show_vlines)
-    pixWrite("hlines.png", pixt1, IFF_PNG);
-  Boxa* boxa = pixConnComp(pixt1, NULL, 8);
-  *line_pix = pixt1;
+  return music_mask;
+}

-  // Iterate the boxes to flip x and y.
-  int nboxes = boxaGetCount(boxa);
-  for (int i = 0; i < nboxes; ++i) {
-    l_int32 x, y, box_width, box_height;
-    boxaGetBoxGeometry(boxa, i, &x, &y, &box_width, &box_height);
-    Box* box = boxCreate(height - (y + box_height),
-                         width - (x + box_width), box_height, box_width);
-    boxaReplaceBox(boxa, i, box);
+// Most of the heavy lifting of line finding. Given src_pix and its separate
+// resolution, returns image masks:
+// pix_vline           candidate vertical lines.
+// pix_non_vline       pixels that didn't look like vertical lines.
+// pix_hline           candidate horizontal lines.
+// pix_non_hline       pixels that didn't look like horizontal lines.
+// pix_intersections   pixels where vertical and horizontal lines meet.
+// pix_music_mask      candidate music staves.
+// This function promises to initialize all the output (2nd level) pointers,
+// but any of the returns that are empty will be NULL on output.
+// None of the input (1st level) pointers may be NULL except pix_music_mask,
+// which will disable music detection, and pixa_display, which is for debug.
+void LineFinder::GetLineMasks(int resolution, Pix* src_pix,
+                              Pix** pix_vline, Pix** pix_non_vline,
+                              Pix** pix_hline, Pix** pix_non_hline,
+                              Pix** pix_intersections, Pix** pix_music_mask,
+                              Pixa* pixa_display) {
+  int max_line_width = resolution / kThinLineFraction;
+  int min_line_length = resolution / kMinLineLengthFraction;
+  if (pixa_display != NULL) {
+    tprintf("Image resolution = %d, max line width = %d, min length=%d\n",
+            resolution, max_line_width, min_line_length);
+  }
+  int closing_brick = max_line_width / 3;
+
+  // Close up small holes, making it less likely that false alarms are found
+  // in thickened text (as it will become more solid) and also smoothing over
+  // some line breaks and nicks in the edges of the lines.
+  Pix* pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
+  if (pixa_display != NULL)
+    pixaAddPix(pixa_display, pix_closed, L_CLONE);
+  // Open up with a big box to detect solid areas, which can then be subtracted.
+  // This is very generous and will leave in even quite wide lines.
+  Pix* pix_solid = pixOpenBrick(NULL, pix_closed, max_line_width,
+                                max_line_width);
+  if (pixa_display != NULL)
+    pixaAddPix(pixa_display, pix_solid, L_CLONE);
+  Pix* pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
+  pixDestroy(&pix_solid);
+  // Now open up in both directions independently to find lines of at least
+  // 1 inch/kMinLineLengthFraction in length.
+  if (pixa_display != NULL)
+    pixaAddPix(pixa_display, pix_hollow, L_CLONE);
+  *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
+  *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
+  pixDestroy(&pix_hollow);
+  // Lines are sufficiently rare, that it is worth checking for a zero image.
+  l_int32 v_empty = 0;
+  l_int32 h_empty = 0;
+  pixZero(*pix_vline, &v_empty);
+  pixZero(*pix_hline, &h_empty);
+  if (pix_music_mask != NULL) {
+    if (!v_empty && !h_empty) {
+      *pix_music_mask = FilterMusic(resolution, pix_closed,
+                                    *pix_vline, *pix_hline,
+                                    &v_empty, &h_empty);
+    } else {
+      *pix_music_mask = NULL;
+    }
+  }
+  pixDestroy(&pix_closed);
+  Pix* pix_nonlines = NULL;
+  *pix_intersections = NULL;
+  Pix* extra_non_hlines = NULL;
+  if (!v_empty) {
+    // Subtract both line candidates from the source to get definite non-lines.
+    pix_nonlines = pixSubtract(NULL, src_pix, *pix_vline);
+    if (!h_empty) {
+      pixSubtract(pix_nonlines, pix_nonlines, *pix_hline);
+      // Intersections are a useful indicator for likelihood of being a line.
+      *pix_intersections = pixAnd(NULL, *pix_vline, *pix_hline);
+      // Candidate vlines are not hlines (apart from the intersections)
+      // and vice versa.
+      extra_non_hlines = pixSubtract(NULL, *pix_vline, *pix_intersections);
+    }
+    *pix_non_vline = pixErodeBrick(NULL, pix_nonlines, kMaxLineResidue, 1);
+    pixSeedfillBinary(*pix_non_vline, *pix_non_vline, pix_nonlines, 8);
+    if (!h_empty) {
+      // Candidate hlines are not vlines.
+      pixOr(*pix_non_vline, *pix_non_vline, *pix_hline);
+      pixSubtract(*pix_non_vline, *pix_non_vline, *pix_intersections);
+    }
+    if (!FilterFalsePositives(resolution, *pix_non_vline, *pix_intersections,
+                              *pix_vline))
+      pixDestroy(pix_vline);  // No candidates left.
+  } else {
+    // No vertical lines.
+    pixDestroy(pix_vline);
+    *pix_non_vline = NULL;
+    if (!h_empty) {
+      pix_nonlines = pixSubtract(NULL, src_pix, *pix_hline);
+    }
+  }
+  if (h_empty) {
+    pixDestroy(pix_hline);
+    *pix_non_hline = NULL;
+    if (v_empty) {
+      return;
+    }
+  } else {
+    *pix_non_hline = pixErodeBrick(NULL, pix_nonlines, 1, kMaxLineResidue);
+    pixSeedfillBinary(*pix_non_hline, *pix_non_hline, pix_nonlines, 8);
+    if (extra_non_hlines != NULL) {
+      pixOr(*pix_non_hline, *pix_non_hline, extra_non_hlines);
+      pixDestroy(&extra_non_hlines);
+    }
+    if (!FilterFalsePositives(resolution, *pix_non_hline, *pix_intersections,
+                              *pix_hline))
+      pixDestroy(pix_hline);  // No candidates left.
+  }
+  if (pixa_display != NULL) {
+    if (*pix_vline != NULL) pixaAddPix(pixa_display, *pix_vline, L_CLONE);
+    if (*pix_hline != NULL) pixaAddPix(pixa_display, *pix_hline, L_CLONE);
+    if (pix_nonlines != NULL) pixaAddPix(pixa_display, pix_nonlines, L_CLONE);
+    if (*pix_non_vline != NULL)
+      pixaAddPix(pixa_display, *pix_non_vline, L_CLONE);
+    if (*pix_non_hline != NULL)
+      pixaAddPix(pixa_display, *pix_non_hline, L_CLONE);
+    if (*pix_intersections != NULL)
+      pixaAddPix(pixa_display, *pix_intersections, L_CLONE);
+    if (pix_music_mask != NULL && *pix_music_mask != NULL)
+      pixaAddPix(pixa_display, *pix_music_mask, L_CLONE);
+  }
+  pixDestroy(&pix_nonlines);
+}
+
+// Returns a list of boxes corresponding to the candidate line segments. Sets
+// the line_crossings member of the boxes so we can later determine the number
+// of intersections touched by a full line.
+void LineFinder::GetLineBoxes(bool horizontal_lines,
+                              Pix* pix_lines, Pix* pix_intersections,
+                              C_BLOB_LIST* line_cblobs,
+                              BLOBNBOX_LIST* line_bblobs) {
+  // Put a single pixel crack in every line at an arbitrary spacing,
+  // so they break up and the bounding boxes can be used to get the
+  // direction accurately enough without needing outlines.
+  int wpl = pixGetWpl(pix_lines);
+  int width = pixGetWidth(pix_lines);
+  int height = pixGetHeight(pix_lines);
+  l_uint32* data = pixGetData(pix_lines);
+  if (horizontal_lines) {
+    for (int y = 0; y < height; ++y, data += wpl) {
+      for (int x = kCrackSpacing; x < width; x += kCrackSpacing) {
+        CLEAR_DATA_BIT(data, x);
+      }
+    }
+  } else {
+    for (int y = kCrackSpacing; y < height; y += kCrackSpacing) {
+      memset(data + wpl * y, 0, wpl * sizeof(*data));
+    }
+  }
+  // Get the individual connected components
+  Boxa* boxa = pixConnComp(pix_lines, NULL, 8);
+  ConvertBoxaToBlobs(width, height, &boxa, line_cblobs);
+  // Make the BLOBNBOXes from the C_BLOBs.
+  C_BLOB_IT blob_it(line_cblobs);
+  BLOBNBOX_IT bbox_it(line_bblobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    C_BLOB* cblob = blob_it.data();
+    BLOBNBOX* bblob = new BLOBNBOX(cblob);
+    bbox_it.add_to_end(bblob);
+    // Determine whether the line segment touches two intersections.
+    const TBOX& bbox = bblob->bounding_box();
+    Box* box = boxCreate(bbox.left(), bbox.bottom(),
+                         bbox.width(), bbox.height());
+    bblob->set_line_crossings(NumTouchingIntersections(box, pix_intersections));
+    boxDestroy(&box);
+    // Transform the bounding box prior to finding lines. To save writing
+    // two line finders, flip x and y for horizontal lines and re-use the
+    // tab-stop detection code. For vertical lines we still have to flip the
+    // y-coordinates to switch from leptonica coords to tesseract coords.
+    if (horizontal_lines) {
+      // Note that we have Leptonica coords stored in a Tesseract box, so that
+      // bbox.bottom(), being the MIN y coord, is actually the top, so to get
+      // back to Leptonica coords in RemoveUnusedLineSegments, we have to
+      // use height - box.right() as the top, which looks very odd.
+      TBOX new_box(height - bbox.top(), bbox.left(),
+                   height - bbox.bottom(), bbox.right());
+      bblob->set_bounding_box(new_box);
+    } else {
+      TBOX new_box(bbox.left(), height - bbox.top(),
+                   bbox.right(), height - bbox.bottom());
+      bblob->set_bounding_box(new_box);
+    }
  }
-  return boxa;
 }

 }  // namespace tesseract.
--- a/textord/linefind.h
+++ b/textord/linefind.h
@ -21,8 +21,9 @@
 #ifndef TESSERACT_TEXTORD_LINEFIND_H__
 #define TESSERACT_TEXTORD_LINEFIND_H__

-struct Pix;
 struct Boxa;
+struct Pix;
+struct Pixa;
 class C_BLOB_LIST;
 class BLOBNBOX_LIST;
 class ICOORD;
@ -38,7 +39,8 @@ class TabVector_LIST;
 class LineFinder {
 public:
  /**
-   * Finds vertical line objects in the given pix.
+   * Finds vertical and horizontal line objects in the given pix and removes
+   * them.
   *
   * Uses the given resolution to determine size thresholds instead of any
   * that may be present in the pix.
@ -46,24 +48,21 @@ class LineFinder {
   * The output vertical_x and vertical_y contain a sum of the output vectors,
   * thereby giving the mean vertical direction.
   *
-   * The output vectors are owned by the list and Frozen (cannot refit) by
-   * having no boxes, as there is no need to refit or merge separator lines.
-   */
-  static void FindVerticalLines(int resolution,  Pix* pix,
-                                int* vertical_x, int* vertical_y,
-                                TabVector_LIST* vectors);
-
-  /**
-   * Finds horizontal line objects in the given pix.
-   *
-   * Uses the given resolution to determine size thresholds instead of any
-   * that may be present in the pix.
+   * If pix_music_mask != NULL, and music is detected, a mask of the staves
+   * and anything that is connected (bars, notes etc.) will be returned in
+   * pix_music_mask, the mask subtracted from pix, and the lines will not
+   * appear in v_lines or h_lines.
   *
   * The output vectors are owned by the list and Frozen (cannot refit) by
   * having no boxes, as there is no need to refit or merge separator lines.
+   *
+   * The detected lines are removed from the pix.
   */
-  static void FindHorizontalLines(int resolution,  Pix* pix,
-                                  TabVector_LIST* vectors);
+  static void FindAndRemoveLines(int resolution,  bool debug, Pix* pix,
+                                 int* vertical_x, int* vertical_y,
+                                 Pix** pix_music_mask,
+                                 TabVector_LIST* v_lines,
+                                 TabVector_LIST* h_lines);

  /**
   * Converts the Boxa array to a list of C_BLOB, getting rid of severely
@ -78,43 +77,71 @@ class LineFinder {
                                 Boxa** boxes, C_BLOB_LIST* blobs);

 private:
-  /**
-   * Finds vertical lines in the given list of BLOBNBOXes. bleft and tright
-   * are the bounds of the image on which the input line_bblobs were found.
-   *
-   * The input line_bblobs list is const really.
-   *
-   * The output vertical_x and vertical_y are the total of all the vectors.
-   * The output list of TabVector makes no reference to the input BLOBNBOXes.
-   */
+  // Finds vertical line objects in pix_vline and removes them from src_pix.
+  // Uses the given resolution to determine size thresholds instead of any
+  // that may be present in the pix.
+  // The output vertical_x and vertical_y contain a sum of the output vectors,
+  // thereby giving the mean vertical direction.
+  // The output vectors are owned by the list and Frozen (cannot refit) by
+  // having no boxes, as there is no need to refit or merge separator lines.
+  // If no good lines are found, pix_vline is destroyed.
+  static void FindAndRemoveVLines(int resolution,
+                                  Pix* pix_intersections,
+                                  int* vertical_x, int* vertical_y,
+                                  Pix** pix_vline, Pix* pix_non_vline,
+                                  Pix* src_pix, TabVector_LIST* vectors);
+
+
+  // Finds horizontal line objects in pix_hline and removes them from src_pix.
+  // Uses the given resolution to determine size thresholds instead of any
+  // that may be present in the pix.
+  // vertical_x and vertical_y are passed by value, so nothing is returned in
+  // them: locally they hold the vector sum (mean vertical direction).
+  // The output vectors are owned by the list and Frozen (cannot refit) by
+  // having no boxes, as there is no need to refit or merge separator lines.
+  // If no good lines are found, pix_hline is destroyed.
+  static void FindAndRemoveHLines(int resolution,
+                                  Pix* pix_intersections,
+                                  int vertical_x, int vertical_y,
+                                  Pix** pix_hline, Pix* pix_non_hline,
+                                  Pix* src_pix, TabVector_LIST* vectors);
+
+  // Finds vertical lines in the given list of BLOBNBOXes. bleft and tright
+  // are the bounds of the image on which the input line_bblobs were found.
+  // The input line_bblobs list is const really.
+  // The output vertical_x and vertical_y are the total of all the vectors.
+  // The output list of TabVector makes no reference to the input BLOBNBOXes.
  static void FindLineVectors(const ICOORD& bleft, const ICOORD& tright,
                              BLOBNBOX_LIST* line_bblobs,
                              int* vertical_x, int* vertical_y,
                              TabVector_LIST* vectors);

-  /**
-   * Get a set of bounding boxes of possible vertical lines in the image.
-   *
-   * The input resolution overrides any resolution set in src_pix.
-   *
-   * The output line_pix contains just all the detected lines.
-   */
-  static Boxa* GetVLineBoxes(int resolution, Pix* src_pix, Pix** line_pix);
+  // Most of the heavy lifting of line finding. Given src_pix and its separate
+  // resolution, fills in the output pointers below.
+  // Returns image masks:
+  // pix_vline           candidate vertical lines.
+  // pix_non_vline       pixels that didn't look like vertical lines.
+  // pix_hline           candidate horizontal lines.
+  // pix_non_hline       pixels that didn't look like horizontal lines.
+  // pix_intersections   pixels where vertical and horizontal lines meet.
+  // pix_music_mask      candidate music staves.
+  // This function promises to initialize all the output (2nd level) pointers,
+  // but any of the returns that are empty will be NULL on output.
+  // None of the input (1st level) pointers may be NULL except pix_music_mask,
+  // which will disable music detection, and pixa_display, which is for debug.
+  static void GetLineMasks(int resolution, Pix* src_pix,
+                           Pix** pix_vline, Pix** pix_non_vline,
+                           Pix** pix_hline, Pix** pix_non_hline,
+                           Pix** pix_intersections, Pix** pix_music_mask,
+                           Pixa* pixa_display);

-  /**
-   * Get a set of bounding boxes of possible horizontal lines in the image.
-   *
-   * The input resolution overrides any resolution set in src_pix.
-   *
-   * The output line_pix contains just all the detected lines.
-   *
-   * The output boxes undergo the transformation (x,y)->(height-y,x) so the
-   * lines can be found with a vertical line finder afterwards.
-   *
-   * This transformation allows a simple x/y flip to reverse it in tesseract
-   * coordinates and it is faster to flip the lines than rotate the image.
-   */
-  static Boxa* GetHLineBoxes(int resolution, Pix* src_pix, Pix** line_pix);
+  // Returns a list of boxes corresponding to the candidate line segments. Sets
+  // the line_crossings member of the boxes so we can later determine the number
+  // of intersections touched by a full line.
+  static void GetLineBoxes(bool horizontal_lines,
+                           Pix* pix_lines, Pix* pix_intersections,
+                           C_BLOB_LIST* line_cblobs,
+                           BLOBNBOX_LIST* line_bblobs);
 };

 }  // namespace tesseract.
--- a/textord/makerow.cpp
+++ b/textord/makerow.cpp
@ -312,6 +312,9 @@ void compute_page_skew(                        //get average gradient
  blob_count = 0;
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
       block_it.forward ()) {
+    POLY_BLOCK* pb = block_it.data()->block->poly_block();
+    if (pb != NULL && !pb->IsText())
+      continue;  // Pretend non-text blocks don't exist.
    row_count += block_it.data ()->get_rows ()->length ();
    //count up rows
    row_it.set_to_list (block_it.data ()->get_rows ());
@ -332,6 +335,9 @@ void compute_page_skew(                        //get average gradient
  row_index = 0;
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
       block_it.forward ()) {
+    POLY_BLOCK* pb = block_it.data()->block->poly_block();
+    if (pb != NULL && !pb->IsText())
+      continue;  // Pretend non-text blocks don't exist.
    row_it.set_to_list (block_it.data ()->get_rows ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
@ -359,6 +365,9 @@ void compute_page_skew(                        //get average gradient
                                 //desperate
    for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
         block_it.forward ()) {
+      POLY_BLOCK* pb = block_it.data()->block->poly_block();
+      if (pb != NULL && !pb->IsText())
+        continue;  // Pretend non-text blocks don't exist.
      row_it.set_to_list (block_it.data ()->get_rows ());
      for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
           row_it.forward ()) {
@ -593,8 +602,11 @@ void Textord::cleanup_rows_fitting(ICOORD page_tr,    // top right
  if (textord_heavy_nr) {
    vigorous_noise_removal(block);
  }
-  separate_underlines(block, gradient, rotation, testing_on);
-  pre_associate_blobs(page_tr, block, rotation, testing_on);
+  POLY_BLOCK* pb = block->block->poly_block();
+  if (pb == NULL || pb->IsText()) {
+    separate_underlines(block, gradient, rotation, testing_on);
+    pre_associate_blobs(page_tr, block, rotation, testing_on);
+  }

 #ifndef GRAPHICS_DISABLED
  if (textord_show_final_rows && testing_on) {
--- a/textord/strokewidth.cpp
+++ b/textord/strokewidth.cpp
--- a/textord/strokewidth.h
+++ b/textord/strokewidth.h
@ -20,38 +20,55 @@
 #ifndef TESSERACT_TEXTORD_STROKEWIDTH_H__
 #define TESSERACT_TEXTORD_STROKEWIDTH_H__

-#include "bbgrid.h"         // Base class.
 #include "blobbox.h"        // BlobNeighbourDir.
-#include "tabvector.h"      // For BLOBNBOX_CLIST.
+#include "blobgrid.h"         // Base class.
+#include "colpartitiongrid.h"
+#include "textlineprojection.h"

-class TO_BLOCK;
+class DENORM;
 class ScrollView;
+class TO_BLOCK;

 namespace tesseract {

 class ColPartition_LIST;
 class TabFind;
+class TextlineProjection;
+
+// Misc enums to clarify bool arguments for direction-controlling args.
+enum LeftOrRight {
+  LR_LEFT,
+  LR_RIGHT
+};

 /**
 * The StrokeWidth class holds all the normal and large blobs.
 * It is used to find good large blobs and move them to the normal blobs
 * by virtue of having a reasonable strokewidth compatible neighbour.
 */
-class StrokeWidth : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
+class StrokeWidth : public BlobGrid {
 public:
  StrokeWidth(int gridsize, const ICOORD& bleft, const ICOORD& tright);
  virtual ~StrokeWidth();

+  // Sets the neighbours member of the medium-sized blobs in the block.
+  // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
+  // blobs and sets pointers to the good neighbours.
+  void SetNeighboursOnMediumBlobs(TO_BLOCK* block);
+
+  // Sets the neighbour/textline writing direction members of the medium
+  // and large blobs with optional repair of broken CJK characters first.
+  // Repair of broken CJK is needed here because broken CJK characters
+  // can fool the textline direction detection algorithm.
+  void FindTextlineDirectionAndFixBrokenCJK(bool cjk_merge,
+                                            TO_BLOCK* input_block);
+
  // To save computation, the process of generating partitions is broken
  // into the following 4 steps:
  // TestVerticalTextDirection
  // CorrectForRotation (used only if a rotation is to be applied)
  // FindLeaderPartitions
-  // TODO(rays) Coming soon:
  // GradeBlobsIntoPartitions.
-  // which will replace entirely the old call sequence of:
-  // InsertBlobsOld
-  // MoveGoodLargeBlobs.
  // These functions are all required, in sequence, except for
  // CorrectForRotation, which is not needed if no rotation is applied.

@ -59,36 +76,50 @@ class StrokeWidth : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
  // returns true if the majority are vertical.
  // If the blobs are rotated, it is necessary to call CorrectForRotation
  // after rotating everything, otherwise the work done here will be enough.
-  // If cjk_merge is true, it will attempt to merge broken cjk characters.
  // If osd_blobs is not null, a list of blobs from the dominant textline
  // direction are returned for use in orientation and script detection.
-  bool TestVerticalTextDirection(bool cjk_merge,
-                                 TO_BLOCK* block, TabFind* line_grid,
+  bool TestVerticalTextDirection(TO_BLOCK* block,
                                 BLOBNBOX_CLIST* osd_blobs);

  // Corrects the data structures for the given rotation.
-  void CorrectForRotation(const FCOORD& rotation, TO_BLOCK* block);
+  void CorrectForRotation(const FCOORD& rerotation,
+                          ColPartitionGrid* part_grid);

  // Finds leader partitions and inserts them into the given grid.
-  void FindLeaderPartitions(TO_BLOCK* block, TabFind* line_grid);
+  void FindLeaderPartitions(TO_BLOCK* block,
+                            ColPartitionGrid* part_grid);
+
+  // Finds blobs that look like bits of vertical lines and marks them as
+  // noise, as they would otherwise screw up layout analysis.
+  void RemoveLineResidue(ColPartition_LIST* big_part_list);
+
+  // Types all the blobs as vertical text or horizontal text or unknown and
+  // puts them into initial ColPartitions in the supplied part_grid.
+  // rerotation determines how to get back to the image coordinates from the
+  // blob coordinates (since they may have been rotated for vertical text).
+  // block is the single block for the whole page or rectangle to be OCRed.
+  // nontext_pix (full-size), is a binary mask used to prevent merges across
+  // photo/text boundaries. It is not kept beyond this function.
+  // denorm provides a mapping back to the image from the current blob
+  // coordinate space.
+  // projection provides a measure of textline density over the image and
+  // provides functions to assist with diacritic detection. It should be a
+  // pointer to a new TextlineProjection, and will be setup here.
+  // part_grid is the output grid of textline partitions.
+  // Large blobs that cause overlap are put in separate partitions and added
+  // to the big_parts list.
+  void GradeBlobsIntoPartitions(const FCOORD& rerotation,
+                                TO_BLOCK* block,
+                                Pix* nontext_pix,
+                                const DENORM* denorm,
+                                TextlineProjection* projection,
+                                ColPartitionGrid* part_grid,
+                                ColPartition_LIST* big_parts);

  // Handles a click event in a display window.
  virtual void HandleClick(int x, int y);

-  // Puts the block blobs (normal and large) into the grid.
-  void InsertBlobsOld(TO_BLOCK* block, TabFind* line_grid);
-
-  // Moves the large blobs that have good stroke-width neighbours to the normal
-  // blobs list.
-  void MoveGoodLargeBlobs(int resolution, TO_BLOCK* block);
-
 private:
-  // Reorganize the blob lists with a different definition of small, medium
-  // and large, compared to the original definition.
-  // Height is still the primary filter key, but medium width blobs of small
-  // height become medium, and very wide blobs of small height stay small.
-  void ReFilterBlobs(TO_BLOCK* block);
-
  // Computes the noise_density_ by summing the number of elements in a
  // neighbourhood of each grid cell.
  void ComputeNoiseDensity(TO_BLOCK* block, TabFind* line_grid);
@ -96,20 +127,25 @@ class StrokeWidth : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
  // Detects and marks leader dots/dashes.
  //    Leaders are horizontal chains of small or noise blobs that look
  //    monospace according to ColPartition::MarkAsLeaderIfMonospaced().
-  // Detected leaders become the only occupants of small_blobs list.
+  // Detected leaders become the only occupants of the block->small_blobs list.
  // Non-leader small blobs get moved to the blobs list.
  // Non-leader noise blobs remain singletons in the noise list.
  // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
-  void FindLeadersAndMarkNoise(bool final, TO_BLOCK* block, TabFind* line_grid,
+  // block is the single block for the whole page or rectangle to be OCRed.
+  // leader_parts is the output.
+  void FindLeadersAndMarkNoise(TO_BLOCK* block,
                               ColPartition_LIST* leader_parts);

-  // Puts the block blobs (normal and large) into the grid.
-  void InsertBlobs(TO_BLOCK* block, TabFind* line_grid);
+  /** Inserts the block blobs (normal and large) into this grid.
+   * Blobs remain owned by the block. */
+  void InsertBlobs(TO_BLOCK* block);

  // Fix broken CJK characters, using the fake joined blobs mechanism.
  // Blobs are really merged, ie the master takes all the outlines and the
  // others are deleted.
-  void FixBrokenCJK(BLOBNBOX_LIST* blobs, TabFind* line_grid);
+  // Returns true if sufficient blobs are merged that it may be worth running
+  // again, due to a better estimate of character size.
+  bool FixBrokenCJK(TO_BLOCK* block);

  // Collect blobs that overlap or are within max_dist of the input bbox.
  // Return them in the list of blobs and expand the bbox to be the union
@ -119,16 +155,21 @@ class StrokeWidth : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
                          int max_size, int max_dist,
                          TBOX* bbox, BLOBNBOX_CLIST* blobs);

-  // Finds the textline direction to be horizontal or vertical according
-  // to distance to neighbours and 1st and 2nd order neighbours.
-  // Non-text tends to end up without a definite direction.
-  void FindTextlineFlowDirection(bool final);
+  // For each blob in this grid, finds the textline direction to be horizontal
+  // or vertical according to distance to neighbours and 1st and 2nd order
+  // neighbours. Non-text tends to end up without a definite direction.
+  // Result is setting of the neighbours and vert_possible/horz_possible
+  // flags in the BLOBNBOXes currently in this grid.
+  // This function is called more than once if page orientation is uncertain,
+  // so display_if_debugging is true on the final call to display the results.
+  void FindTextlineFlowDirection(bool display_if_debugging);

  // Sets the neighbours and good_stroke_neighbours members of the blob by
  // searching close on all 4 sides.
  // When finding leader dots/dashes, there is a slightly different rule for
  // what makes a good neighbour.
-  void SetNeighbours(bool leaders, BLOBNBOX* blob);
+  // If activate_line_trap, then line-like objects are found and isolated.
+  void SetNeighbours(bool leaders, bool activate_line_trap, BLOBNBOX* blob);

  // Sets the good_stroke_neighbours member of the blob if it has a
  // GoodNeighbour on the given side.
@ -151,26 +192,111 @@ class StrokeWidth : public BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> {
  // changed. Otherwise, only ambiguous blobs are processed.
  void SmoothNeighbourTypes(BLOBNBOX* blob, bool desperate);

-  // Sets the leader_on_left or leader_on_right flags for blobs
-  // that are next to one end of the given leader partition.
-  // If left_of_part is true, then look at the left side of the partition for
-  // blobs on which to set the leader_on_right flag.
-  void MarkLeaderNeighbours(const ColPartition* part, bool left_of_part);
+  // Checks the left or right side of the given leader partition and sets the
+  // (opposite) leader_on_right or leader_on_left flags for blobs
+  // that are next to the given side of the given leader partition.
+  void MarkLeaderNeighbours(const ColPartition* part, LeftOrRight side);
+
+  // Partition creation. Accumulates vertical and horizontal text chains,
+  // puts the remaining blobs in as unknowns, and then merges/splits to
+  // minimize overlap and smoothes the types with neighbours and the color
+  // image if provided. rerotation is used to rotate the coordinate space
+  // back to the nontext_map_ image.
+  void FindInitialPartitions(const FCOORD& rerotation,
+                             TO_BLOCK* block,
+                             ColPartitionGrid* part_grid,
+                             ColPartition_LIST* big_parts);
+  // Finds vertical chains of text-like blobs and puts them in ColPartitions.
+  void FindVerticalTextChains(ColPartitionGrid* part_grid);
+  // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
+  void FindHorizontalTextChains(ColPartitionGrid* part_grid);
+  // Finds diacritics and saves their base character in the blob.
+  void TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block);
+  // Searches this grid for an appropriately close and sized neighbour of the
+  // given [small] blob. If such a blob is found, the diacritic base is saved
+  // in the blob and true is returned.
+  // The small_grid is a secondary grid that contains the small/noise objects
+  // that are not in this grid, but may be useful for determining a connection
+  // between blob and its potential base character. (See DiacriticXGapFilled.)
+  bool DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob);
+  // Returns true if there is no gap between the base char and the diacritic
+  // bigger than a fraction of the height of the base char:
+  // Eg: line end.....'
+  // The quote is a long way from the end of the line, yet it needs to be a
+  // diacritic. To determine that the quote is not part of an image, or
+  // a different text block, we check for other marks in the gap between
+  // the base char and the diacritic.
+  //                          '<--Diacritic
+  // |---------|
+  // |         |<-toobig-gap->
+  // | Base    |<ok gap>
+  // |---------|        x<-----Dot occupying gap
+  // The grid is const really.
+  bool DiacriticXGapFilled(BlobGrid* grid, const TBOX& diacritic_box,
+                           const TBOX& base_box);
+  // Merges diacritics with the ColPartition of the base character blob.
+  void MergeDiacritics(TO_BLOCK* block, ColPartitionGrid* part_grid);
+  // Any blobs on the large_blobs list of block that are still unowned by a
+  // ColPartition, are probably drop-cap or vertically touching so the blobs
+  // are removed to the big_parts list and treated separately.
+  void RemoveLargeUnusedBlobs(TO_BLOCK* block,
+                              ColPartitionGrid* part_grid,
+                              ColPartition_LIST* big_parts);
+
+  // All remaining unused blobs are put in individual ColPartitions.
+  void PartitionRemainingBlobs(ColPartitionGrid* part_grid);
+
+  // If combine, put all blobs in the cell_list into a single partition,
+  // otherwise put each one into its own partition.
+  void MakePartitionsFromCellList(bool combine,
+                                  ColPartitionGrid* part_grid,
+                                  BLOBNBOX_CLIST* cell_list);
+
+  // Helper function to finish setting up a ColPartition and insert into
+  // part_grid.
+  void CompletePartition(ColPartition* part, ColPartitionGrid* part_grid);
+
+  // Merge partitions where the merge appears harmless.
+  void EasyMerges(ColPartitionGrid* part_grid);
+
+  // Compute a search box based on the orientation of the partition.
+  // Returns true if a suitable box can be calculated.
+  // Callback for EasyMerges.
+  bool OrientationSearchBox(ColPartition* part, TBOX* box);
+
+  // Merge confirmation callback for EasyMerges.
+  bool ConfirmEasyMerge(const ColPartition* p1, const ColPartition* p2);
+
+  // Returns true if there is no significant noise in between the boxes.
+  bool NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const;

  // Displays the blobs colored according to the number of good neighbours
  // and the vertical/horizontal flow.
  ScrollView* DisplayGoodBlobs(const char* window_name, int x, int y);

+  // Displays blobs colored according to whether or not they are diacritics.
+  ScrollView* DisplayDiacritics(const char* window_name,
+                                int x, int y, TO_BLOCK* block);
+
 private:
-  // Returns true if there is at least one side neighbour that has a similar
-  // stroke width.
-  bool GoodTextBlob(BLOBNBOX* blob);
-  // Grid to indicate the dot noise density at each grid coord.
-  IntGrid* noise_density_;
+  // Image map of photo/noise areas on the page. Borrowed pointer (not owned.)
+  Pix* nontext_map_;
+  // Textline projection map. Borrowed pointer.
+  TextlineProjection* projection_;
+  // DENORM used by projection_ to get back to image coords. Borrowed pointer.
+  const DENORM* denorm_;
+  // Bounding box of the grid.
+  TBOX grid_box_;
+  // Rerotation to get back to the original image.
+  FCOORD rerotation_;
  // Windows for debug display.
  ScrollView* leaders_win_;
  ScrollView* initial_widths_win_;
  ScrollView* widths_win_;
+  ScrollView* chains_win_;
+  ScrollView* diacritics_win_;
+  ScrollView* textlines_win_;
+  ScrollView* smoothed_win_;
 };

 }  // namespace tesseract.
--- a/textord/tabfind.cpp
+++ b/textord/tabfind.cpp
--- a/textord/tabfind.h
+++ b/textord/tabfind.h
@ -45,6 +45,7 @@ namespace tesseract {
 typedef TessResultCallback1<bool, int> WidthCallback;

 struct AlignedBlobParams;
+class ColPartitionGrid;

 /** Pixel resolution of column width estimates. */
 const int kColumnWidthFactor = 20;
@ -67,30 +68,40 @@ class TabFind : public AlignedBlob {

  /**
   * Insert a list of blobs into the given grid (not necessarily this).
-   * If take_ownership is true, then the blobs are removed from the source list.
   * See InsertBlob for the other arguments.
+   * It would seem to make more sense to swap this and grid, but this way
+   * around allows grid to not be derived from TabFind, eg a ColPartitionGrid,
+   * while the grid that provides the tab stops (this) has to be derived from
+   * TabFind.
   */
-  void InsertBlobList(bool h_spread, bool v_spread, bool large,
-                      BLOBNBOX_LIST* blobs, bool take_ownership,
-                      BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>* grid);
+  void InsertBlobsToGrid(bool h_spread, bool v_spread,
+                         BLOBNBOX_LIST* blobs,
+                         BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>* grid);

  /**
   * Insert a single blob into the given grid (not necessarily this).
   * If h_spread, then all cells covered horizontally by the box are
   * used, otherwise, just the bottom-left. Similarly for v_spread.
-   * If large, then insert only if the bounding box doesn't intersect
-   * anything else already in the grid. Returns true if the blob was inserted.
   * A side effect is that the left and right rule edges of the blob are
   * set according to the tab vectors in this (not grid).
   */
-  bool InsertBlob(bool h_spread, bool v_spread, bool large, BLOBNBOX* blob,
+  bool InsertBlob(bool h_spread, bool v_spread, BLOBNBOX* blob,
                  BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>* grid);
-
+  // Calls SetBlobRuleEdges for all the blobs in the given block.
+  void SetBlockRuleEdges(TO_BLOCK* block);
+  // Sets the left and right rule and crossing_rules for the blobs in the given
+  // list by finding the next outermost tabvectors for each blob.
+  void SetBlobRuleEdges(BLOBNBOX_LIST* blobs);

  // Returns the gutter width of the given TabVector between the given y limits.
  // Also returns x-shift to be added to the vector to clear any intersecting
  // blobs. The shift is deducted from the returned gutter.
+  // If ignore_unmergeables is true, then blobs of UnMergeableType are
+  // ignored as if they don't exist. (Used for text on image.)
+  // max_gutter_width is used as the maximum width worth searching for in case
+  // there is nothing near the TabVector.
  int GutterWidth(int bottom_y, int top_y, const TabVector& v,
+                  bool ignore_unmergeables, int max_gutter_width,
                  int* required_shift);
  /**
   * Find the gutter width and distance to inner neighbour for the given blob.
@ -100,20 +111,6 @@ class TabFind : public AlignedBlob {
                                  BLOBNBOX* bbox, int* gutter_width,
                                  int* neighbour_gap);

-  /**
-   * Find the next adjacent (to left or right) blob on this text line,
-   * with the constraint that it must vertically significantly overlap
-   * the input box.
-   */
-  BLOBNBOX* AdjacentBlob(const BLOBNBOX* bbox,
-                         bool right_to_left, int gap_limit);
-
-  /**
-   * Compute and return, but do not set the type as being BRT_TEXT or
-   * BRT_UNKNOWN according to how well it forms a text line.
-   */
-  BlobRegionType ComputeBlobType(BLOBNBOX* blob);
-
  /**
   * Return the x-coord that corresponds to the right edge for the given
   * box. If there is a rule line to the right that vertically overlaps it,
@ -192,16 +189,24 @@ class TabFind : public AlignedBlob {
  /**
   * Top-level function to find TabVectors in an input page block.
   * Returns false if the detected skew angle is impossible.
+   * Applies the detected skew angle to deskew the tabs, blobs and part_grid.
   */
  bool FindTabVectors(TabVector_LIST* hlines,
                      BLOBNBOX_LIST* image_blobs, TO_BLOCK* block,
                      int min_gutter_width,
+                      ColPartitionGrid* part_grid,
                      FCOORD* deskew, FCOORD* reskew);

  // Top-level function to not find TabVectors in an input page block,
  // but setup for single column mode.
  void DontFindTabVectors(BLOBNBOX_LIST* image_blobs,
                          TO_BLOCK* block, FCOORD* deskew, FCOORD* reskew);
+
+  // Cleans up the lists of blobs in the block ready for use by TabFind.
+  // Large blobs that look like text are moved to the main blobs list.
+  // Main blobs that are superseded by the image blobs are deleted.
+  void TidyBlobs(TO_BLOCK* block);
+
  // Helper function to setup search limits for *TabForBox.
  void SetupTabSearch(int x, int y, int* min_key, int* max_key);

@ -229,15 +234,33 @@ class TabFind : public AlignedBlob {
                            TabVector_LIST* horizontal_lines,
                            int* min_gutter_width);

+  // Clear the grid and get rid of the tab vectors, but not separators,
+  // ready to start again.
+  void Reset();
+
+  // Reflect the separator tab vectors and the grids in the y-axis.
+  // Can only be called after Reset!
+  void ReflectInYAxis();
+
 private:
  // For each box in the grid, decide whether it is a candidate tab-stop,
-  // and if so add it to the tab_grid_.
+  // and if so add it to the left and right tab boxes.
  ScrollView* FindTabBoxes(int min_gutter_width);

  // Return true if this box looks like a candidate tab stop, and set
  // the appropriate tab type(s) to TT_UNCONFIRMED.
  bool TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width);

+  // Returns true if there is nothing in the rectangle of width min_gutter to
+  // the left of bbox.
+  bool ConfirmRaggedLeft(BLOBNBOX* bbox, int min_gutter);
+  // Returns true if there is nothing in the rectangle of width min_gutter to
+  // the right of bbox.
+  bool ConfirmRaggedRight(BLOBNBOX* bbox, int min_gutter);
+  // Returns true if there is nothing in the given search_box that vertically
+  // overlaps target_box other than target_box itself.
+  bool NothingYOverlapsInBox(const TBOX& search_box, const TBOX& target_box);
+
  // Fills the list of TabVector with the tabstops found in the grid,
  // and estimates the logical vertical direction.
  void FindAllTabVectors(int min_gutter_width);
@ -272,13 +295,17 @@ class TabFind : public AlignedBlob {
  // Trace textlines from one side to the other of each tab vector, saving
  // the most frequent column widths found in a list so that a given width
  // can be tested for being a common width with a simple callback function.
-  void ComputeColumnWidths(ScrollView* tab_win);
+  void ComputeColumnWidths(ScrollView* tab_win,
+                           ColPartitionGrid* part_grid);

-  // Set the region_type_ member for all the blobs in the grid.
-  void ComputeBlobGoodness();
+  // Find column width and pair-up tab vectors with existing ColPartitions.
+  void ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
+                                     STATS* col_widths);

-  // Set the region_type_ member of the blob, if not already known.
-  void SetBlobRegionType(BLOBNBOX* blob);
+  // Helper makes the list of common column widths in column_widths_ from the
+  // input col_widths. Destroys the content of col_widths by repeatedly
+  // finding the mode and erasing the peak.
+  void MakeColumnWidths(int col_widths_size, STATS* col_widths);

  // Mark blobs as being in a vertical text line where that is the case.
  void MarkVerticalText();
@ -288,48 +315,14 @@ class TabFind : public AlignedBlob {
  // points (< kMinLinesInColumn), then 0 is returned.
  int FindMedianGutterWidth(TabVector_LIST* tab_vectors);

-  // If this box looks like it is on a textline in the given direction,
-  // return the width of the textline-like group of blobs, and the number
-  // of blobs found.
-  // For more detail see FindTextlineSegment below.
-  int FindTextlineWidth(bool right_to_left, BLOBNBOX* bbox, int* blob_count);
-
-  // Search from the given tabstop bbox to the next opposite
-  // tabstop bbox on the same text line, which may be itself.
-  // Returns true if the search is successful, and sets
-  // start_pt, end_pt to the fitted baseline, width to the measured
-  // width of the text line (column width estimate.)
-  bool TraceTextline(BLOBNBOX* bbox, ICOORD* start_pt, ICOORD* end_pt,
-                     int* left_edge, int* right_edge);
-
-  // Search from the given bbox in the given direction until the next tab
-  // vector is found or a significant horizontal gap is found.
-  // Returns the width of the line if the search is successful, (defined
-  // as good coverage of the width and a good fitting baseline) and sets
-  // start_pt, end_pt to the fitted baseline, left_blob, right_blob to
-  // the ends of the line. Returns zero otherwise.
-  // Sets blob_count to the number of blobs found on the line.
-  // On input, either both left_vector and right_vector should be NULL,
-  // indicating a basic search, or both left_vector and right_vector should
-  // be not NULL and one of *left_vector and *right_vector should be not NULL,
-  // in which case the search is strictly between tab vectors and will return
-  // zero if a gap is found before the opposite tab vector is reached, or a
-  // conflicting tab vector is found.
-  // If ignore_images is true, then blobs with aligned_text() < 0 are treated
-  // as if they do not exist.
-  int FindTextlineSegment(bool right_to_lefts, bool ignore_images,
-                          BLOBNBOX* bbox, int* blob_count,
-                          ICOORD* start_pt, ICOORD* end_pt,
-                          TabVector** left_vector, TabVector** right_vector,
-                          BLOBNBOX** left_blob, BLOBNBOX** right_blob);
-
  // Find the next adjacent (to left or right) blob on this text line,
  // with the constraint that it must vertically significantly overlap
  // the [top_y, bottom_y] range.
  // If ignore_images is true, then blobs with aligned_text() < 0 are treated
  // as if they do not exist.
  BLOBNBOX* AdjacentBlob(const BLOBNBOX* bbox,
-                         bool right_to_left, bool ignore_images,
+                         bool look_left, bool ignore_images,
+                         double min_overlap_fraction,
                         int gap_limit, int top_y, int bottom_y);

  // Add a bi-directional partner relationship between the left
@ -373,8 +366,9 @@ class TabFind : public AlignedBlob {
  ICOORDELT_LIST column_widths_;  //< List of commonly occurring widths.
  /** Callback to test an int for being a common width. */
  WidthCallback* width_cb_;
-  /** Instance of the base class that contains only candidate tab stops. */
-  BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>* tab_grid_;
+  // Sets of bounding boxes that are candidate tab stops.
+  GenericVector<BLOBNBOX*> left_tab_boxes_;
+  GenericVector<BLOBNBOX*> right_tab_boxes_;
 };

 }  // namespace tesseract.
--- a/textord/tablefind.cpp
+++ b/textord/tablefind.cpp
@ -970,7 +970,7 @@ bool TableFinder::HasLeaderAdjacent(const ColPartition& part) {
      if (!part.IsInSameColumnAs(*leader))
        break;
      // There should be a significant vertical overlap
-      if (!leader->VOverlaps(part))
+      if (!leader->VSignificantCoreOverlap(part))
        continue;
      // Leader passed all tests, so it is adjacent.
      return true;
@ -2112,9 +2112,17 @@ void TableFinder::MakeTableBlocks(ColPartitionGrid* grid,
    }
    // Insert table colpartition back to part_grid_
    if (table_partition) {
-      table_partition->SetPartitionType(resolution_,
-                                        all_columns[table_search.GridY()]);
+      // To match the columns used when transforming to blocks, the new table
+      // partition must have its first and last column set at the grid y that
+      // corresponds to its bottom.
+      const TBOX& table_box = table_partition->bounding_box();
+      int grid_x, grid_y;
+      grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
+      table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
      table_partition->set_table_type();
+      table_partition->set_blob_type(BRT_TEXT);
+      table_partition->set_flow(BTFT_CHAIN);
+      table_partition->SetBlobTypes();
      grid->InsertBBox(true, true, table_partition);
    }
  }
--- a/textord/tabvector.cpp
+++ b/textord/tabvector.cpp
@ -26,6 +26,7 @@
 #include "colfind.h"
 #include "colpartitionset.h"
 #include "detlinefit.h"
+#include "statistc.h"

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
@ -52,7 +53,7 @@ const double kLineCountReciprocal = 4.0;
 // Constant add-on for minimum gutter for aligned tabs.
 const double kMinAlignedGutter = 0.25;
 // Constant add-on for minimum gutter for ragged tabs.
-const double kMinRaggedGutter = 2.0;
+const double kMinRaggedGutter = 1.5;

 double_VAR(textord_tabvector_vertical_gap_fraction, 0.5,
  "max fraction of mean blob width allowed for vertical gaps in vertical text");
@ -205,7 +206,8 @@ TabVector::TabVector(const TabVector& src, TabAlignment alignment,
                     const ICOORD& vertical_skew, BLOBNBOX* blob)
  : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
    sort_key_(0), percent_score_(0), mean_width_(0),
-    needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
+    needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false),
+    alignment_(alignment),
    top_constraints_(NULL), bottom_constraints_(NULL) {
  BLOBNBOX_C_IT it(&boxes_);
  it.add_to_end(blob);
@ -236,6 +238,7 @@ TabVector* TabVector::ShallowCopy() const {
  copy->alignment_ = alignment_;
  copy->extended_ymax_ = extended_ymax_;
  copy->extended_ymin_ = extended_ymin_;
+  copy->intersects_other_lines_ = intersects_other_lines_;
  return copy;
 }

@ -373,6 +376,9 @@ void TabVector::MergeSimilarTabVectors(const ICOORD& vertical,
          v1->Print("by deleting");
        }
        v2->MergeWith(vertical, it1.extract());
+        if (textord_debug_tabfind) {
+          v2->Print("Producing");
+        }
        ICOORD merged_vector = v2->endpt();
        merged_vector -= v2->startpt();
        if (abs(merged_vector.x()) > 100) {
@ -604,13 +610,19 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
    ++height_count;
  }
  mean_height /= height_count;
+  int max_gutter = kGutterMultiple * mean_height;
+  if (IsRagged()) {
+    // Ragged edges face a tougher test in that the gap must always be within
+    // the height of the blob.
+    max_gutter = kGutterToNeighbourRatio * mean_height;
+  }

+  STATS gutters(0, max_gutter + 1);
  // Evaluate the boxes for their goodness, calculating the coverage as we go.
  // Remove boxes that are not good and shorten the list to the first and
  // last good boxes.
-  bool deleted_a_box = false;
-  int mean_gutter = 0;
-  int gutter_count = 0;
+  int num_deleted_boxes = 0;
+  bool text_on_image = false;
  int good_length = 0;
  const TBOX* prev_good_box = NULL;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
@ -618,8 +630,10 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
    const TBOX& box = bbox->bounding_box();
    int mid_y = (box.top() + box.bottom()) / 2;
    if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
-      if (!debug)
+      if (!debug) {
+        tprintf("After already deleting %d boxes, ", num_deleted_boxes);
        Print("Starting evaluation");
+      }
      debug = true;
    }
    // A good box is one where the nearest neighbour on the inside is closer
@ -627,17 +641,11 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
    // (of the putative column).
    bool left = IsLeftTab();
    int tab_x = XAtY(mid_y);
-    int max_gutter = kGutterMultiple * mean_height;
-    if (IsRagged()) {
-      // Ragged edges face a tougher test in that the gap must always be within
-      // the height of the blob.
-      max_gutter = kGutterToNeighbourRatio * mean_height;
-    }
    int gutter_width;
    int neighbour_gap;
    finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
                                       bbox, &gutter_width, &neighbour_gap);
-    if (TabFind::WithinTestRegion(2, tab_x, mid_y)) {
+    if (debug) {
      tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
              box.left(), box.bottom(), box.right(), box.top(),
              gutter_width, neighbour_gap);
@ -646,8 +654,7 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
    if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
      // A good box contributes its height to the good_length.
      good_length += box.top() - box.bottom();
-      mean_gutter += gutter_width;
-      ++gutter_count;
+      gutters.add(gutter_width, 1);
      // Two good boxes together contribute the gap between them
      // to the good_length as well, as long as the gap is not
      // too big.
@ -667,6 +674,8 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
        SetYStart(box.bottom());
      }
      prev_good_box = &box;
+      if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
+        text_on_image = true;
    } else {
      // Get rid of boxes that are not good.
      if (debug) {
@ -675,7 +684,7 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
                gutter_width, neighbour_gap);
      }
      it.extract();
-      deleted_a_box = true;
+      ++num_deleted_boxes;
    }
  }
  if (debug) {
@ -684,8 +693,10 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
  // If there are any good boxes, do it again, except this time get rid of
  // boxes that have a gutter that is a small fraction of the mean gutter.
  // This filters out ends that run into a coincidental gap in the text.
-  if (gutter_count > 0) {
-    mean_gutter /= gutter_count;
+  int search_top = endpt_.y();
+  int search_bottom = startpt_.y();
+  int median_gutter = IntCastRounded(gutters.median());
+  if (gutters.get_total() > 0) {
    prev_good_box = NULL;
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      BLOBNBOX* bbox = it.data();
@ -706,21 +717,23 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
      finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
                                         bbox, &gutter_width, &neighbour_gap);
      // Now we can make the test.
-      if (gutter_width >= mean_gutter * kMinGutterFraction) {
+      if (gutter_width >= median_gutter * kMinGutterFraction) {
        if (prev_good_box == NULL) {
          // Adjust the start to the first good box.
          SetYStart(box.bottom());
+          search_bottom = box.top();
        }
        prev_good_box = &box;
+        search_top = box.bottom();
      } else {
        // Get rid of boxes that are not good.
-        if (TabFind::WithinTestRegion(2, tab_x, mid_y)) {
+        if (debug) {
          tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
                  box.left(), box.bottom(), box.right(), box.top(),
-                  gutter_width, mean_gutter);
+                  gutter_width, median_gutter);
        }
        it.extract();
-        deleted_a_box = true;
+        ++num_deleted_boxes;
      }
    }
  }
@ -730,7 +743,7 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
    // Compute the percentage of the vector that is occupied by good boxes.
    int length = endpt_.y() - startpt_.y();
    percent_score_ = 100 * good_length / length;
-    if (deleted_a_box) {
+    if (num_deleted_boxes > 0) {
      needs_refit_ = true;
      FitAndEvaluateIfNeeded(vertical, finder);
      if (boxes_.empty())
@ -738,11 +751,19 @@ void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
    }
    // Test the gutter over the whole vector, instead of just at the boxes.
    int required_shift;
-    int gutter_width = finder->GutterWidth(startpt_.y(), endpt_.y(), *this,
-                                           &required_shift);
+    if (search_bottom > search_top) {
+      search_bottom = startpt_.y();
+      search_top = endpt_.y();
+    }
    double min_gutter_width = kLineCountReciprocal / boxes_.length();
    min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
    min_gutter_width *= mean_height;
+    int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
+    if (median_gutter > max_gutter_width)
+      max_gutter_width = median_gutter;
+    int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
+                                           text_on_image, max_gutter_width,
+                                           &required_shift);
    if (gutter_width < min_gutter_width) {
      if (debug) {
        tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
--- a/textord/tabvector.h
+++ b/textord/tabvector.h
@ -20,6 +20,7 @@
 #ifndef TESSERACT_TEXTORD_TABVECTOR_H__
 #define TESSERACT_TEXTORD_TABVECTOR_H__

+#include "blobgrid.h"
 #include "clst.h"
 #include "elst.h"
 #include "elst2.h"
@ -29,8 +30,6 @@
 class BLOBNBOX;
 class ScrollView;

-CLISTIZEH(BLOBNBOX)
-
 namespace tesseract {


@ -56,8 +55,6 @@ enum TabAlignment {
 class TabFind;
 class TabVector;
 class TabConstraint;
-typedef BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> BlobGrid;
-typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> BlobGridSearch;

 ELIST2IZEH(TabVector)
 CLISTIZEH(TabVector)
@ -179,6 +176,12 @@ class TabVector : public ELIST2_LINK {
  void set_endpt(const ICOORD& end) {
    endpt_ = end;
  }
+  // Returns the current value of the intersects_other_lines_ flag.
+  bool intersects_other_lines() const {
+    return intersects_other_lines_;
+  }
+  // Stores value in the intersects_other_lines_ flag.
+  void set_intersects_other_lines(bool value) {
+    intersects_other_lines_ = value;
+  }

  // Inline quasi-accessors that require some computation.

@ -258,6 +261,21 @@ class TabVector : public ELIST2_LINK {
    endpt_.set_x(x);
  }

+  // Mirrors the tab vector about the y-axis: the x coordinates of both end
+  // points and the sort key are negated, and a left alignment (aligned or
+  // ragged) is exchanged with its right counterpart and vice versa. Any other
+  // alignment value is left as it is.
+  void ReflectInYAxis() {
+    startpt_.set_x(-startpt_.x());
+    endpt_.set_x(-endpt_.x());
+    sort_key_ = -sort_key_;
+    switch (alignment_) {
+      case TA_LEFT_ALIGNED:
+        alignment_ = TA_RIGHT_ALIGNED;
+        break;
+      case TA_RIGHT_ALIGNED:
+        alignment_ = TA_LEFT_ALIGNED;
+        break;
+      case TA_LEFT_RAGGED:
+        alignment_ = TA_RIGHT_RAGGED;
+        break;
+      case TA_RIGHT_RAGGED:
+        alignment_ = TA_LEFT_RAGGED;
+        break;
+      default:
+        break;
+    }
+  }
+
  // Separate function to compute the sort key for a given coordinate pair.
  static int SortKey(const ICOORD& vertical, int x, int y) {
    ICOORD pt(x, y);
@ -393,6 +411,8 @@ class TabVector : public ELIST2_LINK {
  bool needs_refit_;
  // True if a fit has been done, so re-evaluation is needed.
  bool needs_evaluation_;
+  // True if a separator line intersects at least 2 other lines.
+  bool intersects_other_lines_;
  // The type of this TabVector.
  TabAlignment alignment_;
  // The list of boxes whose edges are aligned at this TabVector.
--- a/textord/textlineprojection.cpp
+++ b/textord/textlineprojection.cpp
@ -0,0 +1,764 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textlineprojection.h"
+#include "allheaders.h"
+#include "bbgrid.h"         // Base class.
+#include "blobbox.h"        // BlobNeighbourDir.
+#include "blobs.h"
+#include "colpartition.h"
+#include "normalis.h"
+
+// Padding factor to use on definitely oriented blobs
+const int kOrientedPadFactor = 8;
+// Padding factor to use on not definitely oriented blobs.
+const int kDefaultPadFactor = 2;
+// Penalty factor for going away from the line center.
+const int kWrongWayPenalty = 4;
+// Ratio between parallel gap and perpendicular gap used to measure total
+// distance of a box from a target box in curved textline space.
+// parallel-gap is treated more favorably by this factor to allow catching
+// quotes and ellipses at the end of textlines.
+const int kParaPerpDistRatio = 4;
+// Multiple of scale_factor_ that the inter-line gap must be before we start
+// padding the increment box perpendicular to the text line.
+const int kMinLineSpacingFactor = 4;
+// Maximum tab-stop overrun for horizontal padding, in projection pixels.
+const int kMaxTabStopOverrun = 6;
+
+namespace tesseract {
+
+// Starts with no projection pix and a zero origin. The scale factor is chosen
+// from the input resolution so that the projection is roughly 100 ppi.
+TextlineProjection::TextlineProjection(int resolution)
+  : x_origin_(0), y_origin_(0), pix_(NULL) {
+  // Round resolution/100 to the nearest integer, but never go below 1.
+  int scale = IntCastRounded(resolution / 100.0);
+  scale_factor_ = scale < 1 ? 1 : scale;
+}
+// Releases the projection image held in pix_.
+TextlineProjection::~TextlineProjection() {
+  pixDestroy(&pix_);
+}
+
+// Build the projection profile given the input_block containing lists of
+// blobs, a rotation to convert to image coords,
+// and a full-resolution nontext_map, marking out areas to avoid.
+// During construction, we have the following assumptions:
+// The rotation is a multiple of 90 degrees, ie no deskew yet.
+// The blobs have had their left and right rules set to also limit
+// the range of projection.
+void TextlineProjection::ConstructProjection(TO_BLOCK* input_block,
+                                             const FCOORD& rotation,
+                                             Pix* nontext_map) {
+  // Discard any projection left over from an earlier call.
+  pixDestroy(&pix_);
+  // The full-resolution image bounds come from the size of nontext_map.
+  TBOX image_box(0, 0, pixGetWidth(nontext_map), pixGetHeight(nontext_map));
+  const int full_height = image_box.height();
+  x_origin_ = 0;
+  y_origin_ = full_height;
+  // The projection is the image size divided by scale_factor_, rounded up.
+  const int proj_width =
+      (image_box.width() + scale_factor_ - 1) / scale_factor_;
+  const int proj_height = (full_height + scale_factor_ - 1) / scale_factor_;
+
+  pix_ = pixCreate(proj_width, proj_height, 8);
+  // Project the block's blobs first, then its large_blobs.
+  ProjectBlobs(&input_block->blobs, rotation, image_box, nontext_map);
+  ProjectBlobs(&input_block->large_blobs, rotation, image_box, nontext_map);
+  // Swap the raw projection for its (1, 1) block-convolved version.
+  Pix* smoothed_pix = pixBlockconv(pix_, 1, 1);
+//  Pix* smoothed_pix = pixBlockconv(pix_, 2, 2);
+  pixDestroy(&pix_);
+  pix_ = smoothed_pix;
+}
+
+// Draws the bounding box of every blob in the list into win, with the pen
+// color chosen per blob: yellow if the blob is UniquelyVertical, otherwise
+// red if BoxOutOfHTextline reports it as outside a textline, else blue.
+void TextlineProjection::PlotGradedBlobs(BLOBNBOX_LIST* blobs,
+                                         ScrollView* win) {
+  BLOBNBOX_IT blob_it(blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    const TBOX& blob_box = blob->bounding_box();
+    bool outside_textline = BoxOutOfHTextline(blob_box, NULL, false);
+    if (blob->UniquelyVertical()) {
+      win->Pen(ScrollView::YELLOW);
+    } else if (outside_textline) {
+      win->Pen(ScrollView::RED);
+    } else {
+      win->Pen(ScrollView::BLUE);
+    }
+    win->Rectangle(blob_box.left(), blob_box.bottom(),
+                   blob_box.right(), blob_box.top());
+  }
+  win->Update();
+}
+
+// Transfers blobs that do not appear to sit on a textline from the blobs
+// list to the end of the small_blobs list.
+// Keeping them out of initial textline finding stops diacritics from
+// forming incorrect textlines. (Introduced mainly to fix Thai.)
+// A blob is moved only if BoxOutOfHTextline is true for its bounding box
+// and the blob is not UniquelyVertical; its neighbours are cleared first.
+void TextlineProjection::MoveNonTextlineBlobs(
+    BLOBNBOX_LIST* blobs, BLOBNBOX_LIST* small_blobs) const {
+  BLOBNBOX_IT src_it(blobs);
+  BLOBNBOX_IT dest_it(small_blobs);
+  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
+    BLOBNBOX* blob = src_it.data();
+    const TBOX& blob_box = blob->bounding_box();
+    bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
+                                               blob_box.bottom());
+    // Blobs that look like part of a textline stay where they are.
+    if (!BoxOutOfHTextline(blob_box, NULL, debug))
+      continue;
+    // So do blobs that are uniquely vertical.
+    if (blob->UniquelyVertical())
+      continue;
+    blob->ClearNeighbours();
+    dest_it.add_to_end(src_it.extract());
+  }
+}
+
+// Renders the projection as a false-color 32 bit image.
+// Pixel values 0-17 ramp up the blue channel, 18-145 ramp up green with blue
+// at full, and values above 145 ramp up red with green and blue at full.
+// The ScrollView display path is currently compiled out, so the image is
+// written to "projection.png" instead.
+void TextlineProjection::DisplayProjection() const {
+  int width = pixGetWidth(pix_);
+  int height = pixGetHeight(pix_);
+  Pix* pixc = pixCreate(width, height, 32);
+  // Without a color image there is nothing to fill in or write out.
+  if (pixc == NULL) return;
+  int src_wpl = pixGetWpl(pix_);
+  int col_wpl = pixGetWpl(pixc);
+  uinT32* src_data = pixGetData(pix_);
+  uinT32* col_data = pixGetData(pixc);
+  for (int y = 0; y < height; ++y, src_data += src_wpl, col_data += col_wpl) {
+    for (int x = 0; x < width; ++x) {
+      int pixel = GET_DATA_BYTE(src_data, x);
+      l_uint32 result;
+      if (pixel <= 17) {
+        composeRGBPixel(0, 0, pixel * 15, &result);
+      } else if (pixel <= 145) {
+        // (pixel - 17) * 2 reaches 256 at pixel == 145, which does not fit in
+        // an 8 bit channel and would spill into red, so clamp it to 255.
+        composeRGBPixel(0, MIN((pixel - 17) * 2, 255), 255, &result);
+      } else {
+        composeRGBPixel((pixel - 145) * 2, 255, 255, &result);
+      }
+      col_data[x] = result;
+    }
+  }
+#if 0
+  // TODO(rays) uncomment when scrollview can display non-binary images.
+  ScrollView* win = new ScrollView("Projection", 0, 0,
+                                   width, height, width, height);
+  win->Image(pixc, 0, 0);
+  win->Update();
+#else
+  pixWrite("projection.png", pixc, IFF_PNG);
+#endif
+  pixDestroy(&pixc);
+}
+
+// Compute the distance of the box from the partition using curved projection
+// space. As DistanceOfBoxFromBox, except that the direction is taken from
+// the ColPartition and the median bounds of the ColPartition are used as
+// the to_box.
+int TextlineProjection::DistanceOfBoxFromPartition(const TBOX& box,
+                                                   const ColPartition& part,
+                                                   const DENORM* denorm,
+                                                   bool debug) const {
+  const bool horizontal = part.IsHorizontalType();
+  // Start from the partition bounding box, then tighten the edges that are
+  // perpendicular to the text direction to the partition's medians:
+  // top/bottom for a horizontal partition, left/right otherwise.
+  TBOX target_box = part.bounding_box();
+  if (horizontal) {
+    target_box.set_top(part.median_top());
+    target_box.set_bottom(part.median_bottom());
+  } else {
+    target_box.set_left(part.median_left());
+    target_box.set_right(part.median_right());
+  }
+  // The box-to-box calculation does the real work.
+  return DistanceOfBoxFromBox(box, target_box, horizontal, denorm, debug);
+}
+
+// Compute the distance from the from_box to the to_box using curved
+// projection space. Separation that involves a decrease in projection
+// density (moving from the from_box to the to_box) is weighted more heavily
+// than constant density, and an increase is weighted less.
+// If horizontal_textline is true, then curved space is used vertically,
+// as for a diacritic on the edge of a textline.
+// The projection uses original image coords, so denorm is used to get
+// back to the image coords from box/part space.
+// How the calculation works: Think of a diacritic near a textline.
+// Distance is measured from the far side of the from_box to the near side of
+// the to_box. Shown is the horizontal textline case.
+//          |------^-----|
+//          | from | box |
+//          |------|-----|
+//   perpendicular |
+//          <------v-------->|--------------------|
+//                  parallel |     to box         |
+//                           |--------------------|
+// Perpendicular distance uses "curved space" See VerticalDistance below.
+// Parallel distance is linear.
+// Result is perpendicular_gap + parallel_gap / kParaPerpDistRatio.
+int TextlineProjection::DistanceOfBoxFromBox(const TBOX& from_box,
+                                             const TBOX& to_box,
+                                             bool horizontal_textline,
+                                             const DENORM* denorm,
+                                             bool debug) const {
+  // Gap along the textline direction (the horizontal gap for a horizontal
+  // textline), plus the extent of the from_box in that direction.
+  int along_gap = 0;
+  // box_end is the from_box end of the line measured in curved space.
+  TPOINT box_end;
+  // target_end is the to_box end of that line.
+  TPOINT target_end;
+  if (horizontal_textline) {
+    along_gap = from_box.x_gap(to_box) + from_box.width();
+    box_end.x = (from_box.left() + from_box.right()) / 2;
+    target_end.x = box_end.x;
+    // Measure from the top of from_box if it overshoots the top of to_box by
+    // at least as much as its bottom overshoots the bottom of to_box.
+    const bool measure_from_top =
+        from_box.top() - to_box.top() >= to_box.bottom() - from_box.bottom();
+    if (measure_from_top) {
+      box_end.y = from_box.top();
+      target_end.y = MIN(to_box.top(), box_end.y);
+    } else {
+      box_end.y = from_box.bottom();
+      target_end.y = MAX(to_box.bottom(), box_end.y);
+    }
+  } else {
+    along_gap = from_box.y_gap(to_box) + from_box.height();
+    box_end.y = (from_box.bottom() + from_box.top()) / 2;
+    target_end.y = box_end.y;
+    // Same choice as above, with right/left in place of top/bottom.
+    const bool measure_from_right =
+        from_box.right() - to_box.right() >= to_box.left() - from_box.left();
+    if (measure_from_right) {
+      box_end.x = from_box.right();
+      target_end.x = MIN(to_box.right(), box_end.x);
+    } else {
+      box_end.x = from_box.left();
+      target_end.x = MAX(to_box.left(), box_end.x);
+    }
+  }
+  // The parallel gap counts for less than the perpendicular gap.
+  const int weighted_parallel = along_gap / kParaPerpDistRatio;
+  // Coincident end points mean the from_box lies entirely within the to_box
+  // in the perpendicular direction, so there is no perpendicular gap.
+  if (box_end.x == target_end.x && box_end.y == target_end.y)
+    return weighted_parallel;
+  if (denorm != NULL) {
+    // Take both end points back to image coordinates.
+    denorm->DenormTransform(box_end, &box_end);
+    denorm->DenormTransform(target_end, &target_end);
+  }
+  // Measure in curved space along whichever axis separates the points most.
+  const int x_span = abs(box_end.x - target_end.x);
+  const int y_span = abs(box_end.y - target_end.y);
+  int curved_gap;
+  if (y_span >= x_span) {
+    curved_gap = VerticalDistance(debug, box_end.x, box_end.y, target_end.y);
+  } else {
+    curved_gap = HorizontalDistance(debug, box_end.x, target_end.x,
+                                    box_end.y);
+  }
+  return curved_gap + weighted_parallel;
+}
+
+// Compute the distance between (x, y1) and (x, y2) using the rule that
+// a decrease in textline density is weighted more heavily than an increase.
+// The coordinates are in source image space, ie processed by any denorm
+// already, but not yet scaled by scale_factor_.
+// Going from the outside of a textline to the inside should measure much
+// less distance than going from the inside of a textline to the outside.
+// How it works:
+// An increase is cheap (getting closer to a textline).
+// Constant costs unity.
+// A decrease is expensive (getting further from a textline).
+// Pixels in projection map Counted distance
+//              2
+//              3              1/x
+//              3               1
+//              2               x
+//              5              1/x
+//              7              1/x
+// Total: 1 + x + 3/x where x = kWrongWayPenalty.
+int TextlineProjection::VerticalDistance(bool debug, int x,
+                                         int y1, int y2) const {
+  // Everything below works in projection pix coordinates.
+  x = ImageXToProjectionX(x);
+  y1 = ImageYToProjectionY(y1);
+  y2 = ImageYToProjectionY(y2);
+  if (y1 == y2) return 0;
+  const int wpl = pixGetWpl(pix_);
+  const int step = y1 < y2 ? 1 : -1;
+  uinT32* const pix_data = pixGetData(pix_);
+  int prev_pixel = GET_DATA_BYTE(pix_data + y1 * wpl, x);
+  int level_steps = 0;      // Steps with unchanged projection value.
+  int wrong_way_steps = 0;  // Steps where the projection value fell.
+  int right_way_steps = 0;  // Steps where the projection value rose.
+  int y = y1;
+  while (y != y2) {
+    y += step;
+    int pixel = GET_DATA_BYTE(pix_data + y * wpl, x);
+    if (debug)
+      tprintf("At (%d,%d), pix = %d, prev=%d\n",
+              x, y, pixel, prev_pixel);
+    if (pixel < prev_pixel) {
+      ++wrong_way_steps;
+    } else if (pixel > prev_pixel) {
+      ++right_way_steps;
+    } else {
+      ++level_steps;
+    }
+    prev_pixel = pixel;
+  }
+  // Falling steps cost kWrongWayPenalty each, level steps cost 1 each, and
+  // rising steps are discounted by kWrongWayPenalty in the return below.
+  int distance = wrong_way_steps * kWrongWayPenalty + level_steps;
+  return distance * scale_factor_ +
+      right_way_steps * scale_factor_ / kWrongWayPenalty;
+}
+
+// Horizontal counterpart of VerticalDistance: measures from (x1, y) to
+// (x2, y), charging kWrongWayPenalty for each step on which the projection
+// value falls, 1 for each level step, and 1/kWrongWayPenalty for each step
+// on which it rises. The result is scaled by scale_factor_.
+int TextlineProjection::HorizontalDistance(bool debug, int x1, int x2,
+                                           int y) const {
+  // Everything below works in projection pix coordinates.
+  x1 = ImageXToProjectionX(x1);
+  x2 = ImageXToProjectionX(x2);
+  y = ImageYToProjectionY(y);
+  if (x1 == x2) return 0;
+  const int wpl = pixGetWpl(pix_);
+  const int step = x1 < x2 ? 1 : -1;
+  uinT32* const row_data = pixGetData(pix_) + y * wpl;
+  int prev_pixel = GET_DATA_BYTE(row_data, x1);
+  int level_steps = 0;      // Steps with unchanged projection value.
+  int wrong_way_steps = 0;  // Steps where the projection value fell.
+  int right_way_steps = 0;  // Steps where the projection value rose.
+  int x = x1;
+  while (x != x2) {
+    x += step;
+    int pixel = GET_DATA_BYTE(row_data, x);
+    if (debug)
+      tprintf("At (%d,%d), pix = %d, prev=%d\n",
+              x, y, pixel, prev_pixel);
+    if (pixel < prev_pixel) {
+      ++wrong_way_steps;
+    } else if (pixel > prev_pixel) {
+      ++right_way_steps;
+    } else {
+      ++level_steps;
+    }
+    prev_pixel = pixel;
+  }
+  int distance = wrong_way_steps * kWrongWayPenalty + level_steps;
+  return distance * scale_factor_ +
+      right_way_steps * scale_factor_ / kWrongWayPenalty;
+}
+
+// Returns true if the box appears to lie outside the body of a horizontal
+// textline. Such blobs are potentially diacritics (even if large in Thai)
+// and should be kept away from initial textline finding.
+// The decision uses the two horizontal-textline gradients returned by
+// EvaluateBoxInternal: a sum of 6 or more counts as strongly in a textline;
+// below that, the box is outside if either gradient is negative.
+bool TextlineProjection::BoxOutOfHTextline(const TBOX& box,
+                                           const DENORM* denorm,
+                                           bool debug) const {
+  // Gradient sum at or above which the box is strongly in a textline.
+  const int kStrongTextlineTotal = 6;
+  int top_grad = 0;
+  int bottom_grad = 0;
+  EvaluateBoxInternal(box, denorm, debug, &top_grad, &bottom_grad,
+                      NULL, NULL);
+  return top_grad + bottom_grad < kStrongTextlineTotal &&
+      MIN(top_grad, bottom_grad) < 0;
+}
+
+// Evaluates the textlineiness of a ColPartition. Uses EvaluateBox below,
+// but uses the median top/bottom for horizontal and median left/right for
+// vertical instead of the bounding box edges.
+// Evaluates for both horizontal and vertical and returns the best result,
+// with a positive value for horizontal and a negative value for vertical.
+int TextlineProjection::EvaluateColPartition(const ColPartition& part,
+                                             const DENORM* denorm,
+                                             bool debug) const {
+  // A singleton is evaluated directly on its bounding box.
+  if (part.IsSingleton())
+    return EvaluateBox(part.bounding_box(), denorm, debug);
+
+  // Vertical test: bounding box with left/right replaced by the medians.
+  TBOX vertical_box = part.bounding_box();
+  vertical_box.set_left(part.median_left());
+  vertical_box.set_right(part.median_right());
+  int vresult = EvaluateBox(vertical_box, denorm, debug);
+
+  // Horizontal test: bounding box with top/bottom replaced by the medians.
+  TBOX horizontal_box = part.bounding_box();
+  horizontal_box.set_top(part.median_top());
+  horizontal_box.set_bottom(part.median_bottom());
+  int hresult = EvaluateBox(horizontal_box, denorm, debug);
+
+  if (debug) {
+    tprintf("Partition hresult=%d, vresult=%d from:", hresult, vresult);
+    part.bounding_box().print();
+    part.Print();
+  }
+  // Keep the horizontal result unless the negated vertical one exceeds it.
+  if (hresult >= -vresult)
+    return hresult;
+  return vresult;
+}
+
+// Computes the mean projection gradients over the horizontal and vertical
+// edges of the box:
+//   -h-h-h-h-h-h
+//  |------------| mean=htop   -v|+v--------+v|-v
+//  |+h+h+h+h+h+h|             -v|+v        +v|-v
+//  |            |             -v|+v        +v|-v
+//  |    box     |             -v|+v  box   +v|-v
+//  |            |             -v|+v        +v|-v
+//  |+h+h+h+h+h+h|             -v|+v        +v|-v
+//  |------------| mean=hbot   -v|+v--------+v|-v
+//   -h-h-h-h-h-h
+//                           mean=vleft  mean=vright
+//
+// Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number
+// for a horizontal textline, a negative number for a vertical textline,
+// and near zero for undecided. Undecided is most likely non-text.
+// All the gradients are truncated to remain non-negative, since negative
+// horizontal gradients don't give any indication of being vertical and
+// vice versa.
+// Additional complexity: The coordinates have to be transformed to original
+// image coordinates with denorm (if not null), scaled to match the projection
+// pix, and THEN step out 2 pixels each way from the edge to compute the
+// gradient, and tries 3 positions, each measuring the gradient over a
+// 4-pixel spread: (+3/-1), (+2/-2), (+1/-3).  This complexity is handled by
+// several layers of helpers below.
+int TextlineProjection::EvaluateBox(const TBOX& box, const DENORM* denorm,
+                                    bool debug) const {
+  // Only the combined result is wanted here, so all four gradient outputs
+  // are passed as NULL.
+  return EvaluateBoxInternal(box, denorm, debug, NULL, NULL, NULL, NULL);
+}
+
+// Internal version of EvaluateBox returns the unclipped gradients as well
+// as the result of EvaluateBox.
+// hgrad1 and hgrad2 are the gradients for the horizontal textline.
+int TextlineProjection::EvaluateBoxInternal(const TBOX& box,
+                                            const DENORM* denorm, bool debug,
+                                            int* hgrad1, int* hgrad2,
+                                            int* vgrad1, int* vgrad2) const {
+  const inT16 box_left = box.left();
+  const inT16 box_right = box.right();
+  const inT16 box_bottom = box.bottom();
+  const inT16 box_top = box.top();
+  // The bottom and right gradients are negated, so that on all four edges a
+  // positive value has the same meaning.
+  int top_gradient = BestMeanGradientInRow(denorm, box_left, box_right,
+                                           box_top, true);
+  int bottom_gradient = -BestMeanGradientInRow(denorm, box_left, box_right,
+                                               box_bottom, false);
+  int left_gradient = BestMeanGradientInColumn(denorm, box_left, box_bottom,
+                                               box_top, true);
+  int right_gradient = -BestMeanGradientInColumn(denorm, box_right,
+                                                 box_bottom, box_top, false);
+  if (debug) {
+    tprintf("Gradients: top = %d, bottom = %d, left= %d, right= %d for box:",
+            top_gradient, bottom_gradient, left_gradient, right_gradient);
+    box.print();
+  }
+  // Strongest gradient in each orientation, clipped so it is never negative.
+  int horizontal_strength = MAX(top_gradient, bottom_gradient);
+  if (horizontal_strength < 0) horizontal_strength = 0;
+  int vertical_strength = MAX(left_gradient, right_gradient);
+  if (vertical_strength < 0) vertical_strength = 0;
+  // Each pair of unclipped gradients is output only if both of its pointers
+  // were supplied.
+  if (hgrad1 != NULL && hgrad2 != NULL) {
+    *hgrad1 = top_gradient;
+    *hgrad2 = bottom_gradient;
+  }
+  if (vgrad1 != NULL && vgrad2 != NULL) {
+    *vgrad1 = left_gradient;
+    *vgrad2 = right_gradient;
+  }
+  return horizontal_strength - vertical_strength;
+}
+
+// Helper returns the mean gradient value for the horizontal row at the given
+// y, (in the external coordinates) by subtracting the mean of the transformed
+// row 2 pixels above from the mean of the transformed row 2 pixels below.
+// This gives a positive value for a good top edge and negative for bottom.
+// Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.
+int TextlineProjection::BestMeanGradientInRow(const DENORM* denorm,
+                                              inT16 min_x, inT16 max_x, inT16 y,
+                                              bool best_is_max) const {
+  // Offset pairs of the two rows whose means are differenced. The first pair
+  // gives the starting value; the others replace it only if they are better
+  // (larger when best_is_max is true, not larger otherwise).
+  static const int kUpperOffsets[] = {-2, -1, -3};
+  static const int kLowerOffsets[] = {2, 3, 1};
+  const int kNumOffsetPairs = 3;
+  TPOINT start_pt(min_x, y);
+  TPOINT end_pt(max_x, y);
+  int best_gradient = 0;
+  for (int i = 0; i < kNumOffsetPairs; ++i) {
+    int upper = MeanPixelsInLineSegment(denorm, kUpperOffsets[i],
+                                        start_pt, end_pt);
+    int lower = MeanPixelsInLineSegment(denorm, kLowerOffsets[i],
+                                        start_pt, end_pt);
+    int gradient = lower - upper;
+    if (i == 0 || (gradient > best_gradient) == best_is_max)
+      best_gradient = gradient;
+  }
+  return best_gradient;
+}
+
+// Helper returns the mean gradient value for the vertical column at the
+// given x, (in the external coordinates) by subtracting the mean of the
+// transformed column 2 pixels left from the mean of the transformed column
+// 2 pixels to the right.
+// This gives a positive value for a good left edge and negative for right.
+// Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.
+int TextlineProjection::BestMeanGradientInColumn(const DENORM* denorm, inT16 x,
+                                                 inT16 min_y, inT16 max_y,
+                                                 bool best_is_max) const {
+  // Offset pairs of the two columns whose means are differenced. The first
+  // pair gives the starting value; the others replace it only if they are
+  // better (larger when best_is_max is true, not larger otherwise).
+  static const int kLeftOffsets[] = {-2, -1, -3};
+  static const int kRightOffsets[] = {2, 3, 1};
+  const int kNumOffsetPairs = 3;
+  TPOINT start_pt(x, min_y);
+  TPOINT end_pt(x, max_y);
+  int best_gradient = 0;
+  for (int i = 0; i < kNumOffsetPairs; ++i) {
+    int left = MeanPixelsInLineSegment(denorm, kLeftOffsets[i],
+                                       start_pt, end_pt);
+    int right = MeanPixelsInLineSegment(denorm, kRightOffsets[i],
+                                        start_pt, end_pt);
+    int gradient = right - left;
+    if (i == 0 || (gradient > best_gradient) == best_is_max)
+      best_gradient = gradient;
+  }
+  return best_gradient;
+}
+
+// Helper returns the mean pixel value over the line between the start_pt and
+// end_pt (inclusive), but shifted perpendicular to the line in the projection
+// image by offset pixels. For simplicity, it is assumed that the vector is
+// either nearly horizontal or nearly vertical. It works on skewed textlines!
+// The end points are in external coordinates, and will be denormalized with
+// the denorm if not NULL before further conversion to pix coordinates.
+// After all the conversions, the offset is added to the direction
+// perpendicular to the line direction. The offset is thus in projection image
+// coordinates, which allows the caller to get a guaranteed displacement
+// between pixels used to calculate gradients.
+int TextlineProjection::MeanPixelsInLineSegment(const DENORM* denorm,
+                                                int offset,
+                                                TPOINT start_pt,
+                                                TPOINT end_pt) const {
+  // Convert both end points to pix coordinates and keep them in the image.
+  TransformToPixCoords(denorm, &start_pt);
+  TransformToPixCoords(denorm, &end_pt);
+  TruncateToImageBounds(&start_pt);
+  TruncateToImageBounds(&end_pt);
+  int wpl = pixGetWpl(pix_);
+  uinT32* data = pixGetData(pix_);
+  int total = 0;
+  int count = 0;
+  int x_delta = end_pt.x - start_pt.x;
+  int y_delta = end_pt.y - start_pt.y;
+  if (abs(x_delta) >= abs(y_delta)) {
+    // Both deltas are zero here, so the segment is a single point with no
+    // direction to offset from. It contributes nothing.
+    if (x_delta == 0)
+      return 0;
+    // Horizontal line. Add the offset vertically.
+    int x_step = x_delta > 0 ? 1 : -1;
+    // Correct offset for rotation, keeping it anti-clockwise of the delta.
+    offset *= x_step;
+    start_pt.y += offset;
+    end_pt.y += offset;
+    // The shift may have pushed the points outside the image.
+    TruncateToImageBounds(&start_pt);
+    TruncateToImageBounds(&end_pt);
+    x_delta = end_pt.x - start_pt.x;
+    y_delta = end_pt.y - start_pt.y;
+    count = x_delta * x_step + 1;
+    // Sum all count pixels, end_pt included, so that the number of samples
+    // matches the divisor used for the mean below.
+    for (int i = 0; i < count; ++i) {
+      int x = start_pt.x + i * x_step;
+      int y = start_pt.y + DivRounded(y_delta * (x - start_pt.x), x_delta);
+      total += GET_DATA_BYTE(data + wpl * y, x);
+    }
+  } else {
+    // Vertical line. Add the offset horizontally.
+    int y_step = y_delta > 0 ? 1 : -1;
+    // Correct offset for rotation, keeping it anti-clockwise of the delta.
+    // Pix holds the image with y=0 at the top, so the offset is negated.
+    offset *= -y_step;
+    start_pt.x += offset;
+    end_pt.x += offset;
+    // The shift may have pushed the points outside the image.
+    TruncateToImageBounds(&start_pt);
+    TruncateToImageBounds(&end_pt);
+    x_delta = end_pt.x - start_pt.x;
+    y_delta = end_pt.y - start_pt.y;
+    count = y_delta * y_step + 1;
+    // As above, end_pt is included in the sum.
+    for (int i = 0; i < count; ++i) {
+      int y = start_pt.y + i * y_step;
+      int x = start_pt.x + DivRounded(x_delta * (y - start_pt.y), y_delta);
+      total += GET_DATA_BYTE(data + wpl * y, x);
+    }
+  }
+  return DivRounded(total, count);
+}
+
+// Given an input pix and a box, the sides of the box are shrunk inwards until
+// they tightly bound the black (foreground) pixels found within the box.
+// The function converts between tesseract coords and the pix coords assuming
+// that this pix is full resolution equal in size to the original image.
+// Returns an empty box if there are no black pixels in the source box.
+static TBOX BoundsWithinBox(Pix* pix, const TBOX& box) {
+  int im_height = pixGetHeight(pix);  // Needed to flip y: pix has y=0 at top.
+  Box* input_box = boxCreate(box.left(), im_height - box.top(),
+                             box.width(), box.height());
+  Box* output_box = NULL;  // Checked below; NULL means no result box.
+  pixClipBoxToForeground(pix, input_box, NULL, &output_box);
+  TBOX result_box;  // Returned unchanged if output_box stays NULL.
+  if (output_box != NULL) {
+    l_int32 x, y, width, height;
+    boxGetGeometry(output_box, &x, &y, &width, &height);
+    result_box.set_left(x);
+    result_box.set_right(x + width);
+    result_box.set_top(im_height - y);  // Flip back to bottom-up coords.
+    result_box.set_bottom(result_box.top() - height);
+    boxDestroy(&output_box);
+  }
+  boxDestroy(&input_box);  // Created here, so released here.
+  return result_box;
+}
+
+// Splits the given box in half at x_middle or y_middle according to split_on_x
+// and checks for nontext_map pixels in each half. Reduces *bbox in place so
+// that it still includes the middle point, but does not touch any fg pixels
+// in nontext_map. *bbox may be left empty if there is no such box.
+static void TruncateBoxToMissNonText(int x_middle, int y_middle,
+                                     bool split_on_x, Pix* nontext_map,
+                                     TBOX* bbox) {
+  TBOX box1(*bbox);  // Becomes the left (or upper) half.
+  TBOX box2(*bbox);  // Becomes the right (or lower) half.
+  TBOX im_box;  // Bounds of the fg pixels found in the current half.
+  if (split_on_x) {
+    box1.set_right(x_middle);
+    im_box = BoundsWithinBox(nontext_map, box1);
+    if (!im_box.null_box()) box1.set_left(im_box.right());
+    box2.set_left(x_middle);
+    im_box = BoundsWithinBox(nontext_map, box2);
+    if (!im_box.null_box()) box2.set_right(im_box.left());
+  } else {
+    box1.set_bottom(y_middle);
+    im_box = BoundsWithinBox(nontext_map, box1);
+    if (!im_box.null_box()) box1.set_top(im_box.bottom());
+    box2.set_top(y_middle);
+    im_box = BoundsWithinBox(nontext_map, box2);
+    if (!im_box.null_box()) box2.set_bottom(im_box.top());
+  }
+  box1 += box2;  // Recombine the two truncated halves.
+  *bbox = box1;
+}
+
+
+// Helper adds 1 (saturating at 255) to each pixel of the internal projection
+// pix_ covered by the given rectangle, which is in source image coords.
+void TextlineProjection::IncrementRectangle8Bit(const TBOX& box) {
+  int scaled_left = ImageXToProjectionX(box.left());
+  int scaled_top = ImageYToProjectionY(box.top());
+  int scaled_right = ImageXToProjectionX(box.right());
+  int scaled_bottom = ImageYToProjectionY(box.bottom());
+  int wpl = pixGetWpl(pix_);
+  uinT32* data = pixGetData(pix_) + scaled_top * wpl;  // First row to update.
+  for (int y = scaled_top; y <= scaled_bottom; ++y) {  // Both ends inclusive.
+    for (int x = scaled_left; x <= scaled_right; ++x) {
+      int pixel = GET_DATA_BYTE(data, x);
+      if (pixel < 255)  // Don't let the 8-bit count wrap round.
+        SET_DATA_BYTE(data, x, pixel + 1);
+    }
+    data += wpl;  // Step to the next row of pix_.
+  }
+}
+
+// Inserts every blob in the list into the projection as a padded box.
+// Rotation is a multiple of 90 degrees to get from blob coords to
+// nontext_map coords, nontext_map_box is the bounds of the nontext_map.
+// Blobs are spread horizontally or vertically according to their internal
+// flags, but the spreading is truncated by set pixels in the nontext_map
+// and also by the left/right rule limits on the blobs (see PadBlobBox).
+void TextlineProjection::ProjectBlobs(BLOBNBOX_LIST* blobs,
+                                      const FCOORD& rotation,
+                                      const TBOX& nontext_map_box,
+                                      Pix* nontext_map) {
+  BLOBNBOX_IT blob_it(blobs);
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    TBOX bbox = blob->bounding_box();
+    ICOORD middle((bbox.left() + bbox.right()) / 2,
+                  (bbox.bottom() + bbox.top()) / 2);  // Centre before padding.
+    bool spreading_horizontally = PadBlobBox(blob, &bbox);
+    // Rotate both the padded box and its centre to match the nontext_map.
+    bbox.rotate(rotation);
+    middle.rotate(rotation);
+    if (rotation.x() == 0.0f)  // Quarter turn: the spread direction swaps.
+      spreading_horizontally = !spreading_horizontally;
+    // Clip to the image before applying the increments.
+    bbox &= nontext_map_box;  // This is in-place box intersection.
+    // Cut the box back so that it misses any set pixels in the nontext_map.
+    TruncateBoxToMissNonText(middle.x(), middle.y(), spreading_horizontally,
+                             nontext_map, &bbox);
+    if (bbox.area() > 0) {  // Nothing to add if the box was truncated away.
+      IncrementRectangle8Bit(bbox);
+    }
+  }
+}
+
+// Pads *bbox, the bounding box of the given blob, according to whether it is
+// on a horizontal or vertical text line, then limits how far it oversteps the
+// blob's left/right rules. Returns true if padding was horizontal.
+bool TextlineProjection::PadBlobBox(BLOBNBOX* blob, TBOX* bbox) {
+  // Determine which direction to spread.
+  // If text is well spaced out, it can be useful to pad perpendicular to
+  // the textline direction, so as to ensure diacritics get absorbed
+  // correctly, but if the text is tightly spaced, this will destroy the
+  // blank space between textlines in the projection map, and that would
+  // be very bad.
+  int pad_limit = scale_factor_ * kMinLineSpacingFactor;
+  int xpad = 0;
+  int ypad = 0;
+  bool padding_horizontally = false;
+  if (blob->UniquelyHorizontal()) {
+    xpad = bbox->height() * kOrientedPadFactor;
+    padding_horizontally = true;
+    // If the text appears to be very well spaced, pad the other direction by a
+    // single pixel in the projection profile space to help join diacritics to
+    // the textline. A missing neighbour counts as well spaced.
+    if ((blob->neighbour(BND_ABOVE) == NULL ||
+        bbox->y_gap(blob->neighbour(BND_ABOVE)->bounding_box()) > pad_limit) &&
+        (blob->neighbour(BND_BELOW) == NULL ||
+        bbox->y_gap(blob->neighbour(BND_BELOW)->bounding_box()) > pad_limit)) {
+      ypad = scale_factor_;  // One projection pixel in image units.
+    }
+  } else if (blob->UniquelyVertical()) {  // As above with x and y swapped.
+    ypad = bbox->width() * kOrientedPadFactor;
+    if ((blob->neighbour(BND_LEFT) == NULL ||
+        bbox->x_gap(blob->neighbour(BND_LEFT)->bounding_box()) > pad_limit) &&
+        (blob->neighbour(BND_RIGHT) == NULL ||
+        bbox->x_gap(blob->neighbour(BND_RIGHT)->bounding_box()) > pad_limit)) {
+      xpad = scale_factor_;  // One projection pixel in image units.
+    }
+  } else {  // Orientation undecided: pad only towards mutual neighbours.
+    if ((blob->neighbour(BND_ABOVE) != NULL &&
+         blob->neighbour(BND_ABOVE)->neighbour(BND_BELOW) == blob) ||
+        (blob->neighbour(BND_BELOW) != NULL &&
+            blob->neighbour(BND_BELOW)->neighbour(BND_ABOVE) == blob)) {
+      ypad = bbox->width() * kDefaultPadFactor;
+    }
+    if ((blob->neighbour(BND_RIGHT) != NULL &&
+         blob->neighbour(BND_RIGHT)->neighbour(BND_LEFT) == blob) ||
+        (blob->neighbour(BND_LEFT) != NULL &&
+            blob->neighbour(BND_LEFT)->neighbour(BND_RIGHT) == blob)) {
+      xpad = bbox->height() * kDefaultPadFactor;
+      padding_horizontally = true;
+    }
+  }
+  bbox->pad(xpad, ypad);
+  pad_limit = scale_factor_ * kMaxTabStopOverrun;  // Reused as overrun limit.
+  // Now shrink horizontally to avoid stepping more than pad_limit over a
+  // tab-stop, as given by the left and right rules of the blob.
+  if (bbox->left() < blob->left_rule() - pad_limit) {
+    bbox->set_left(blob->left_rule() - pad_limit);
+  }
+  if (bbox->right() > blob->right_rule() + pad_limit) {
+    bbox->set_right(blob->right_rule() + pad_limit);
+  }
+  return padding_horizontally;
+}
+
+// Helper denormalizes the TPOINT in place with the denorm if not NULL, then
+// converts it to pix_ coordinates (scaled, y-flipped and clipped to pix_).
+void TextlineProjection::TransformToPixCoords(const DENORM* denorm,
+                                              TPOINT* pt) const {
+  if (denorm != NULL) {
+    // Denormalize the point, writing the result back over the input.
+    denorm->DenormTransform(*pt, pt);
+  }
+  pt->x = ImageXToProjectionX(pt->x);
+  pt->y = ImageYToProjectionY(pt->y);
+}
+
+// Helper clamps both coordinates of the TPOINT in place to lie within pix_.
+void TextlineProjection::TruncateToImageBounds(TPOINT* pt) const {
+  pt->x = ClipToRange<int>(pt->x, 0, pixGetWidth(pix_) - 1);  // Last column.
+  pt->y = ClipToRange<int>(pt->y, 0, pixGetHeight(pix_) - 1);  // Last row.
+}
+
+// Transform tesseract image coordinates to coordinates used in the projection.
+int TextlineProjection::ImageXToProjectionX(int x) const {
+  x = ClipToRange((x - x_origin_) / scale_factor_, 0, pixGetWidth(pix_) - 1);
+  return x;  // Column in pix_: offset from x_origin_, scaled down and clipped.
+}
+int TextlineProjection::ImageYToProjectionY(int y) const {
+  y = ClipToRange((y_origin_ - y) / scale_factor_, 0, pixGetHeight(pix_) - 1);
+  return y;  // Row in pix_: measured down from y_origin_, scaled and clipped.
+}
+
+}  // namespace tesseract.
--- a/textord/textlineprojection.h
+++ b/textord/textlineprojection.h
@ -0,0 +1,206 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_
+#define TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_
+
+#include "blobgrid.h"      // For BlobGrid
+
+class DENORM;
+struct Pix;
+struct TPOINT;
+
+namespace tesseract {
+
+class ColPartition;
+
+// Simple class to encapsulate the computation of an image representing
+// local textline density, and function(s) to make use of it.
+// The underlying principle is that if you smear connected components
+// horizontally (vertically for components on a vertically written textline)
+// and count the number of smeared components in an image, then the resulting
+// image shows the density of the textlines at each image position.
+class TextlineProjection {
+ public:
+  // The down-scaling factor is computed to obtain a projection resolution
+  // of about 100 dpi, whatever the input.
+  explicit TextlineProjection(int resolution);
+  ~TextlineProjection();
+
+  // Build the projection profile given the input_block containing lists of
+  // blobs, a rotation to convert to image coords,
+  // and a full-resolution nontext_map, marking out areas to avoid.
+  // During construction, we have the following assumptions:
+  // The rotation is a multiple of 90 degrees, ie no deskew yet.
+  // The blobs have had their left and right rules set to also limit
+  // the range of projection.
+  void ConstructProjection(TO_BLOCK* input_block,
+                           const FCOORD& rotation, Pix* nontext_map);
+
+  // Display the blobs in the window colored according to textline quality.
+  void PlotGradedBlobs(BLOBNBOX_LIST* blobs, ScrollView* win);
+
+  // Moves blobs that look like they don't sit well on a textline from the
+  // input blobs list to the output small_blobs list.
+  // This gets them away from initial textline finding to stop diacritics
+  // from forming incorrect textlines. (Introduced mainly to fix Thai.)
+  void MoveNonTextlineBlobs(BLOBNBOX_LIST* blobs,
+                            BLOBNBOX_LIST* small_blobs) const;
+
+  // Create a window and display the projection in it.
+  void DisplayProjection() const;
+
+  // Compute the distance of the box from the partition using curved projection
+  // space. As DistanceOfBoxFromBox, except that the direction is taken from
+  // the ColPartition and the median bounds of the ColPartition are used as
+  // the to_box.
+  int DistanceOfBoxFromPartition(const TBOX& box, const ColPartition& part,
+                                 const DENORM* denorm, bool debug) const;
+
+  // Compute the distance from the from_box to the to_box using curved
+  // projection space. Separation that involves a decrease in projection
+  // density (moving from the from_box to the to_box) is weighted more heavily
+  // than constant density, and an increase is weighted less.
+  // If horizontal_textline is true, then curved space is used vertically,
+  // as for a diacritic on the edge of a textline.
+  // The projection uses original image coords, so denorm is used to get
+  // back to the image coords from box/part space.
+  int DistanceOfBoxFromBox(const TBOX& from_box, const TBOX& to_box,
+                           bool horizontal_textline,
+                           const DENORM* denorm, bool debug) const;
+
+  // Compute the distance between (x, y1) and (x, y2) using the rule that
+  // a decrease in textline density is weighted more heavily than an increase.
+  // The coordinates are in source image space, ie processed by any denorm
+  // already, but not yet scaled by scale_factor_.
+  // Going from the outside of a textline to the inside should measure much
+  // less distance than going from the inside of a textline to the outside.
+  int VerticalDistance(bool debug, int x, int y1, int y2) const;
+
+  // Compute the distance between (x1, y) and (x2, y) using the rule that
+  // a decrease in textline density is weighted more heavily than an increase.
+  int HorizontalDistance(bool debug, int x1, int x2, int y) const;
+
+  // Returns true if the box appears to be outside of a horizontal textline.
+  // Such blobs are potentially diacritics (even if large in Thai) and should
+  // be kept away from initial textline finding.
+  bool BoxOutOfHTextline(const TBOX& box, const DENORM* denorm,
+                        bool debug) const;
+
+  // Evaluates the textlineiness of a ColPartition. Uses EvaluateBox below,
+  // but uses the median top/bottom for horizontal and median left/right for
+  // vertical instead of the bounding box edges.
+  // Evaluates for both horizontal and vertical and returns the best result,
+  // with a positive value for horizontal and a negative value for vertical.
+  int EvaluateColPartition(const ColPartition& part, const DENORM* denorm,
+                           bool debug) const;
+
+  // Computes the mean projection gradients over the horizontal and vertical
+  // edges of the box:
+  //   -h-h-h-h-h-h
+  //  |------------| mean=htop   -v|+v--------+v|-v
+  //  |+h+h+h+h+h+h|             -v|+v        +v|-v
+  //  |            |             -v|+v        +v|-v
+  //  |    box     |             -v|+v  box   +v|-v
+  //  |            |             -v|+v        +v|-v
+  //  |+h+h+h+h+h+h|             -v|+v        +v|-v
+  //  |------------| mean=hbot   -v|+v--------+v|-v
+  //   -h-h-h-h-h-h
+  //                           mean=vleft  mean=vright
+  //
+  // Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number
+  // for a horizontal textline, a negative number for a vertical textline,
+  // and near zero for undecided. Undecided is most likely non-text.
+  int EvaluateBox(const TBOX& box, const DENORM* denorm, bool debug) const;
+
+ private:
+  // Internal version of EvaluateBox returns the unclipped gradients as well
+  // as the result of EvaluateBox.
+  // hgrad1 and hgrad2 are the gradients for the horizontal textline.
+  int EvaluateBoxInternal(const TBOX& box, const DENORM* denorm, bool debug,
+                          int* hgrad1, int* hgrad2,
+                          int* vgrad1, int* vgrad2) const;
+
+  // Helper returns the mean gradient value for the horizontal row at the given
+  // y, (in the external coordinates) by subtracting the mean of the transformed
+  // row 2 pixels above from the mean of the transformed row 2 pixels below.
+  // This gives a positive value for a good top edge and negative for bottom.
+  // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.
+  int BestMeanGradientInRow(const DENORM* denorm, inT16 min_x, inT16 max_x,
+                            inT16 y, bool best_is_max) const;
+
+  // Helper returns the mean gradient value for the vertical column at the
+  // given x, (in the external coordinates) by subtracting the mean of the
+  // transformed column 2 pixels left from the mean of the transformed column
+  // 2 pixels to the right.
+  // This gives a positive value for a good left edge and negative for right.
+  // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge.
+  int BestMeanGradientInColumn(const DENORM* denorm, inT16 x, inT16 min_y,
+                               inT16 max_y, bool best_is_max) const;
+
+  // Helper returns the mean pixel value over the line between the start_pt and
+  // end_pt (inclusive), but shifted perpendicular to the line in the projection
+  // image by offset pixels. For simplicity, it is assumed that the vector is
+  // either nearly horizontal or nearly vertical. It works on skewed textlines!
+  // The end points are in external coordinates, and will be denormalized with
+  // the denorm if not NULL before further conversion to pix coordinates.
+  // After all the conversions, the offset is added to the direction
+  // perpendicular to the line direction. The offset is thus in projection image
+  // coordinates, which allows the caller to get a guaranteed displacement
+  // between pixels used to calculate gradients.
+  int MeanPixelsInLineSegment(const DENORM* denorm, int offset,
+                              TPOINT start_pt, TPOINT end_pt) const;
+
+  // Helper adds 1 (saturating at 255) to each pixel of the internal projection
+  // pix_ covered by the given rectangle, which is in source image coords.
+  void IncrementRectangle8Bit(const TBOX& box);
+  // Inserts a list of blobs into the projection.
+  // Rotation is a multiple of 90 degrees to get from blob coords to
+  // nontext_map coords, image_box is the bounds of the nontext_map.
+  // Blobs are spread horizontally or vertically according to their internal
+  // flags, but the spreading is truncated by set pixels in the nontext_map
+  // and also by the horizontal rule line limits on the blobs.
+  void ProjectBlobs(BLOBNBOX_LIST* blobs, const FCOORD& rotation,
+                    const TBOX& image_box, Pix* nontext_map);
+  // Pads the bounding box of the given blob according to whether it is on
+  // a horizontal or vertical text line, taking into account tab-stops near
+  // the blob. Returns true if padding was in the horizontal direction.
+  bool PadBlobBox(BLOBNBOX* blob, TBOX* bbox);
+
+  // Helper denormalizes the TPOINT with the denorm if not NULL, then
+  // converts to pix_ coordinates.
+  void TransformToPixCoords(const DENORM* denorm, TPOINT* pt) const;
+
+  // Helper truncates the TPOINT to be within the pix_.
+  void TruncateToImageBounds(TPOINT* pt) const;
+
+  // Transform tesseract image coordinates to pix_ coordinates, clipped to pix_.
+  int ImageXToProjectionX(int x) const;
+  int ImageYToProjectionY(int y) const;
+
+  // The down-sampling scale factor used in building the image.
+  int scale_factor_;
+  // The blob coordinates of the top-left (origin of the pix_) in tesseract
+  // coordinates. Used to transform the bottom-up tesseract coordinates to
+  // the top-down coordinates of the pix.
+  int x_origin_;
+  int y_origin_;
+  // The image of horizontally smeared blob boxes summed to provide a
+  // textline density map. As with a horizontal projection, the map has
+  // dips in the gaps between textlines.
+  Pix* pix_;  // Read and written as 8-bit pixels (GET/SET_DATA_BYTE).
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_
--- a/textord/textord.cpp
+++ b/textord/textord.cpp
@ -27,7 +27,7 @@
 namespace tesseract {

 Textord::Textord(CCStruct* ccstruct)
-    : ccstruct_(ccstruct),
+    : ccstruct_(ccstruct), use_cjk_fp_model_(false),
      // makerow.cpp ///////////////////////////////////////////
      BOOL_MEMBER(textord_single_height_mode, false,
                  "Script has no xheight, so use a single mode",
@ -317,6 +317,13 @@ void Textord::TextordPage(PageSegMode pageseg_mode,
                     to_block->get_rows(), to_block->block->row_list());
  }
  cleanup_blocks(blocks);  // Remove empties.
+
+  // Compute the margins for each row in the block, to be used later for
+  // paragraph detection.
+  BLOCK_IT b_it(blocks);
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+    b_it.data()->compute_row_margins();
+  }
 #ifndef GRAPHICS_DISABLED
  close_to_win();
 #endif
--- a/textord/textord.h
+++ b/textord/textord.h
@ -50,6 +50,13 @@ class Textord {
  // than one, clean up and leave only the best.
  void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES* page_res);

+  bool use_cjk_fp_model() const {
+    return use_cjk_fp_model_;  // If true, make_words uses the CJK pitch code.
+  }
+  void set_use_cjk_fp_model(bool flag) {
+    use_cjk_fp_model_ = flag;  // Read back through use_cjk_fp_model().
+  }
+
  // tospace.cpp ///////////////////////////////////////////
  void to_spacing(
      ICOORD page_tr,        //topright of page
@ -64,6 +71,7 @@ class Textord {
  // tordmain.cpp ///////////////////////////////////////////
  void find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
  void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on);
+
 private:
  // For underlying memory management and other utilities.
  CCStruct* ccstruct_;
@ -71,6 +79,8 @@ class Textord {
  // The size of the input image.
  ICOORD page_tr_;

+  bool use_cjk_fp_model_;
+
  // makerow.cpp ///////////////////////////////////////////
  // Make the textlines inside each block.
  void MakeRows(PageSegMode pageseg_mode, const FCOORD& skew,
--- a/textord/topitch.cpp
+++ b/textord/topitch.cpp
@ -112,9 +112,11 @@ void compute_fixed_pitch(ICOORD page_tr,              // top right
  }

  block_index = 1;
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
-  block_it.forward ()) {
+  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
+       block_it.forward()) {
    block = block_it.data ();
+    POLY_BLOCK* pb = block->block->poly_block();
+    if (pb != NULL && !pb->IsText()) continue;  // Non-text doesn't exist!
    row_it.set_to_list (block->get_rows ());
    row_index = 1;
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
@ -166,9 +168,11 @@ void fix_row_pitch(TO_ROW *bad_row,        // row to fix
    block_stats.set_range (0, maxwidth);
    like_stats.set_range (0, maxwidth);
    block_index = 1;
-    for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
-    block_it.forward ()) {
-      block = block_it.data ();
+    for (block_it.mark_cycle_pt(); !block_it.cycled_list();
+         block_it.forward()) {
+      block = block_it.data();
+      POLY_BLOCK* pb = block->block->poly_block();
+      if (pb != NULL && !pb->IsText()) continue;  // Non text doesn't exist!
      row_index = 1;
      row_it.set_to_list (block->get_rows ());
      for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
@ -568,7 +572,8 @@ BOOL8 try_rows_fixed(                    //find line stats
    row = row_it.data ();
    ASSERT_HOST (row->xheight > 0);
    maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace);
-    if (row->fixed_pitch > 0 && fixed_pitch_row (row, block_index)) {
+    if (row->fixed_pitch > 0 &&
+        fixed_pitch_row(row, block->block, block_index)) {
      if (row->fixed_pitch == 0) {
        lower = row->pr_nonsp;
        upper = row->pr_space;
@ -971,9 +976,9 @@ BOOL8 find_row_pitch(                    //find lines
 * The larger threshold is the word gap threshold.
 **********************************************************************/

-BOOL8 fixed_pitch_row(                   //find lines
-                      TO_ROW *row,       //row to do
-                      inT32 block_index  //block_number
+BOOL8 fixed_pitch_row(TO_ROW *row,       // row to do
+                      BLOCK* block,
+                      inT32 block_index  // block_number
                     ) {
  const char *res_string;        //pitch result
  inT16 mid_cuts;                //no of cheap cuts
@ -984,7 +989,8 @@ BOOL8 fixed_pitch_row(                   //find lines
  non_space = row->fp_nonsp;
  if (non_space > row->fixed_pitch)
    non_space = row->fixed_pitch;
-  if (textord_all_prop) {
+  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
+  if (textord_all_prop || (pb != NULL && !pb->IsText())) {
    // Set the decision to definitely proportional.
    pitch_sd = textord_words_def_prop * row->fixed_pitch;
    row->pitch_decision = PITCH_DEF_PROP;
@ -1755,6 +1761,10 @@ void print_pitch_sd(                        //find fp cells
 **********************************************************************/
 void find_repeated_chars(TO_BLOCK *block,       // Block to search.
                         BOOL8 testing_on) {    // Debug mode.
+  POLY_BLOCK* pb = block->block->poly_block();
+  if (pb != NULL && !pb->IsText())
+    return;  // Don't find repeated chars in non-text blocks.
+
  TO_ROW *row;
  BLOBNBOX_IT box_it;
  BLOBNBOX_IT search_it;         // forward search
--- a/textord/topitch.h
+++ b/textord/topitch.h
@ -107,6 +107,7 @@ BOOL8 find_row_pitch(                    //find lines
                    );
 BOOL8 fixed_pitch_row(                   //find lines
                      TO_ROW *row,       //row to do
+                      BLOCK* block,
                      inT32 block_index  //block_number
                     );
 BOOL8 count_pitch_stats(                       //find lines
--- a/textord/wordseg.cpp
+++ b/textord/wordseg.cpp
@ -29,6 +29,7 @@
 #include          "pitsync1.h"
 #include          "tovars.h"
 #include          "topitch.h"
+#include          "cjkpitch.h"
 #include          "textord.h"
 #include          "fpchop.h"
 #include          "wordseg.h"
@ -101,7 +102,6 @@ void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
 *
 * Arrange the blobs into words.
 */
-
 void make_words(tesseract::Textord *textord,
                ICOORD page_tr,                // top right
                float gradient,                // page skew
@ -110,8 +110,12 @@ void make_words(tesseract::Textord *textord,
  TO_BLOCK_IT block_it;          // iterator
  TO_BLOCK *block;               // current block

-  compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
-                      !(BOOL8) textord_test_landscape);
+  if (textord->use_cjk_fp_model()) {
+    compute_fixed_pitch_cjk(page_tr, port_blocks);
+  } else {
+    compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
+                        !(BOOL8) textord_test_landscape);
+  }
  textord->to_spacing(page_tr, port_blocks);
  block_it.set_to_list(port_blocks);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
@ -525,24 +529,26 @@ void make_real_words(
    row = row_it.data ();
    if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
      real_row = make_rep_words (row, block);
-    }
-    else if (!row->blob_list()->empty()) {
+    } else if (!row->blob_list()->empty()) {
      // In a fixed pitch document, some lines may be detected as fixed pitch
      // while others don't, and will go through different path.
      // For non-space delimited language like CJK, fixed pitch chop always
      // leave the entire line as one word.  We can force consistent chopping
      // with force_make_prop_words flag.
+      POLY_BLOCK* pb = block->block->poly_block();
      if (textord_chopper_test) {
        real_row = textord->make_blob_words (row, rotation);
      } else if (textord_force_make_prop_words ||
-          row->pitch_decision == PITCH_DEF_PROP ||
-          row->pitch_decision == PITCH_CORR_PROP) {
+                 (pb != NULL && !pb->IsText()) ||
+                 row->pitch_decision == PITCH_DEF_PROP ||
+                 row->pitch_decision == PITCH_CORR_PROP) {
        real_row = textord->make_prop_words (row, rotation);
      } else if (row->pitch_decision == PITCH_DEF_FIXED ||
                 row->pitch_decision == PITCH_CORR_FIXED) {
        real_row = fixed_pitch_words (row, rotation);
-      } else
+      } else {
        ASSERT_HOST(FALSE);
+      }
    }
    if (real_row != NULL) {
                                 //put row in block
--- a/textord/workingpartset.cpp
+++ b/textord/workingpartset.cpp
@ -108,9 +108,11 @@ void WorkingPartSet::MakeBlocks(const ICOORD& bleft, const ICOORD& tright,
        ColPartition* next_block_part = part_it_.data();
        const TBOX& part_box = part->bounding_box();
        const TBOX& next_box = next_block_part->bounding_box();
+
        // In addition to the same type, the next box must not be above the
        // current box, nor (if image) too far below.
-        if (next_block_part->type() == part->type() &&
+        PolyBlockType type = part->type(), next_type = next_block_part->type();
+        if (ColPartition::TypesSimilar(type, next_type) &&
            next_box.bottom() <= part_box.top() &&
            (text_block ||
             part_box.bottom() - next_box.top() < part_box.height()))
@ -139,4 +141,3 @@ void WorkingPartSet::MakeBlocks(const ICOORD& bleft, const ICOORD& tright,
 }

 }  // namespace tesseract.
-