Moved ResultIterator/PageIterator to ccmain

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@645 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-01-18 14:41:36 +08:00 · 2012-02-02 02:47:59 +00:00 · 2012-02-02 02:47:59 +00:00 · ef786ad29b
commit ef786ad29b
parent 8225f5b846
6 changed files with 1104 additions and 484 deletions
--- a/api/resultiterator.cpp
+++ b/api/resultiterator.cpp
@ -1,278 +0,0 @@
-///////////////////////////////////////////////////////////////////////
-// File:        resultiterator.cpp
-// Description: Iterator for tesseract results that avoids using tesseract
-//              internal data structures
-// Author:      Ray Smith
-// Created:     Fri Feb 26 14:32:09 PST 2010
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-///////////////////////////////////////////////////////////////////////
-
-#include "resultiterator.h"
-#include "allheaders.h"
-#include "pageres.h"
-#include "tesseractclass.h"
-
-namespace tesseract {
-
-ResultIterator::ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                               int scale, int scaled_yres,
-                               int rect_left, int rect_top,
-                               int rect_width, int rect_height)
-  : PageIterator(page_res, tesseract, scale, scaled_yres,
-    rect_left, rect_top, rect_width, rect_height) {
-}
-
-ResultIterator::~ResultIterator() {
-}
-
-// Returns the null terminated UTF-8 encoded text string for the current
-// object at the given level. Use delete [] to free after use.
-char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
-  if (it_->word() == NULL) return NULL;  // Already at the end!
-  STRING text;
-  PAGE_RES_IT res_it(*it_);
-  WERD_CHOICE* best_choice = res_it.word()->best_choice;
-  ASSERT_HOST(best_choice != NULL);
-  switch (level) {
-    case RIL_BLOCK:
-    case RIL_PARA:
-      do {
-        best_choice = res_it.word()->best_choice;
-        ASSERT_HOST(best_choice != NULL);
-        text += best_choice->unichar_string();
-        text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
-        res_it.forward();
-      } while (res_it.block() == res_it.prev_block());
-      break;
-    case RIL_TEXTLINE:
-      do {
-        best_choice = res_it.word()->best_choice;
-        ASSERT_HOST(best_choice != NULL);
-        text += best_choice->unichar_string();
-        text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
-         res_it.forward();
-      } while (res_it.row() == res_it.prev_row());
-      break;
-    case RIL_WORD:
-      text = best_choice->unichar_string();
-      break;
-    case RIL_SYMBOL:
-      text = tesseract_->unicharset.id_to_unichar(
-          best_choice->unichar_id(blob_index_));
-  }
-  int length = text.length() + 1;
-  char* result = new char[length];
-  strncpy(result, text.string(), length);
-  return result;
-}
-
-// Returns the mean confidence of the current object at the given level.
-// The number should be interpreted as a percent probability. (0.0f-100.0f)
-float ResultIterator::Confidence(PageIteratorLevel level) const {
-  if (it_->word() == NULL) return 0.0f;  // Already at the end!
-  float mean_certainty = 0.0f;
-  int certainty_count = 0;
-  PAGE_RES_IT res_it(*it_);
-  WERD_CHOICE* best_choice = res_it.word()->best_choice;
-  ASSERT_HOST(best_choice != NULL);
-  switch (level) {
-    case RIL_BLOCK:
-    case RIL_PARA:
-      do {
-        best_choice = res_it.word()->best_choice;
-        ASSERT_HOST(best_choice != NULL);
-        mean_certainty += best_choice->certainty();
-        ++certainty_count;
-        res_it.forward();
-      } while (res_it.block() == res_it.prev_block());
-      break;
-    case RIL_TEXTLINE:
-      do {
-        best_choice = res_it.word()->best_choice;
-        ASSERT_HOST(best_choice != NULL);
-        mean_certainty += best_choice->certainty();
-        ++certainty_count;
-        res_it.forward();
-      } while (res_it.row() == res_it.prev_row());
-      break;
-    case RIL_WORD:
-      mean_certainty += best_choice->certainty();
-     ++certainty_count;
-      break;
-    case RIL_SYMBOL:
-      BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
-      if (choices != NULL) {
-        BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
-        for (int blob = 0; blob < blob_index_; ++blob)
-          blob_choices_it.forward();
-        BLOB_CHOICE_IT choice_it(blob_choices_it.data());
-        for (choice_it.mark_cycle_pt();
-             !choice_it.cycled_list();
-             choice_it.forward()) {
-          if (choice_it.data()->unichar_id() ==
-              best_choice->unichar_id(blob_index_))
-            break;
-        }
-        mean_certainty += choice_it.data()->certainty();
-      } else {
-        mean_certainty += best_choice->certainty();
-      }
-      ++certainty_count;
-  }
-  if (certainty_count > 0) {
-    mean_certainty /= certainty_count;
-    float confidence = 100 + 5 * mean_certainty;
-    if (confidence < 0.0f) confidence = 0.0f;
-    if (confidence > 100.0f) confidence = 100.0f;
-    return confidence;
-  }
-  return 0.0f;
-}
-
-// Returns the font attributes of the current word. If iterating at a higher
-// level object than words, eg textlines, then this will return the
-// attributes of the first word in that textline.
-// The actual return value is a string representing a font name. It points
-// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
-// the iterator itself, ie rendered invalid by various members of
-// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
-// Pointsize is returned in printers points (1/72 inch.)
-const char* ResultIterator::WordFontAttributes(bool* is_bold,
-                                               bool* is_italic,
-                                               bool* is_underlined,
-                                               bool* is_monospace,
-                                               bool* is_serif,
-                                               bool* is_smallcaps,
-                                               int* pointsize,
-                                               int* font_id) const {
-  if (it_->word() == NULL) return NULL;  // Already at the end!
-  *font_id = it_->word()->fontinfo_id;
-  if (*font_id < 0) return NULL;  // No font available.
-  const UnicityTable<FontInfo> &font_table = tesseract_->get_fontinfo_table();
-  FontInfo font_info = font_table.get(*font_id);
-  *is_bold = font_info.is_bold();
-  *is_italic = font_info.is_italic();
-  *is_underlined = false;  // TODO(rays) fix this!
-  *is_monospace = font_info.is_fixed_pitch();
-  *is_serif = font_info.is_serif();
-  *is_smallcaps = it_->word()->small_caps;
-  // The font size is calculated from a multiple of the x-height
-  // that came from the block.
-  float row_height = it_->row()->row->x_height() *
-      it_->block()->block->cell_over_xheight();
-  // Convert from pixels to printers points.
-  *pointsize = scaled_yres_ > 0
-    ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
-    : 0;
-
-  return font_info.name;
-}
-
-// Returns true if the current word was found in a dictionary.
-bool ResultIterator::WordIsFromDictionary() const {
-  if (it_->word() == NULL) return false;  // Already at the end!
-  int permuter = it_->word()->best_choice->permuter();
-  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
-         permuter == USER_DAWG_PERM;
-}
-
-// Returns true if the current word is numeric.
-bool ResultIterator::WordIsNumeric() const {
-  if (it_->word() == NULL) return false;  // Already at the end!
-  int permuter = it_->word()->best_choice->permuter();
-  return permuter == NUMBER_PERM;
-}
-
-// Returns true if the current symbol is a superscript.
-// If iterating at a higher level object than symbols, eg words, then
-// this will return the attributes of the first symbol in that word.
-bool ResultIterator::SymbolIsSuperscript() const {
-  if (cblob_it_ == NULL && it_->word() != NULL)
-    return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
-  return false;
-}
-
-// Returns true if the current symbol is a subscript.
-// If iterating at a higher level object than symbols, eg words, then
-// this will return the attributes of the first symbol in that word.
-bool ResultIterator::SymbolIsSubscript() const {
-  if (cblob_it_ == NULL && it_->word() != NULL)
-    return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
-  return false;
-}
-
-// Returns true if the current symbol is a dropcap.
-// If iterating at a higher level object than symbols, eg words, then
-// this will return the attributes of the first symbol in that word.
-bool ResultIterator::SymbolIsDropcap() const {
-  if (cblob_it_ == NULL && it_->word() != NULL)
-    return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
-  return false;
-}
-
-ChoiceIterator::ChoiceIterator(const ResultIterator& result_it) {
-  ASSERT_HOST(result_it.it_->word() != NULL);
-  tesseract_ = result_it.tesseract_;
-  PAGE_RES_IT res_it(*result_it.it_);
-  WERD_CHOICE* best_choice = res_it.word()->best_choice;
-  BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
-  if (choices != NULL) {
-    BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
-    for (int blob = 0; blob < result_it.blob_index_; ++blob)
-      blob_choices_it.forward();
-    choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
-    choice_it_->mark_cycle_pt();
-  } else {
-    choice_it_ = NULL;
-  }
-}
-
-ChoiceIterator::~ChoiceIterator() {
-  delete choice_it_;
-}
-
-// Moves to the next choice for the symbol and returns false if there
-// are none left.
-bool ChoiceIterator::Next() {
-  if (choice_it_ == NULL)
-    return false;
-  choice_it_->forward();
-  return !choice_it_->cycled_list();
-}
-
-// Returns the null terminated UTF-8 encoded text string for the current
-// choice. Use delete [] to free after use.
-const char* ChoiceIterator::GetUTF8Text() const {
-  if (choice_it_ == NULL)
-    return NULL;
-  UNICHAR_ID id = choice_it_->data()->unichar_id();
-  if (id < 0 || id >= tesseract_->unicharset.size() ||
-      id == INVALID_UNICHAR_ID)
-    return NULL;
-  return tesseract_->unicharset.id_to_unichar(id);
-}
-
-// Returns the confidence of the current choice.
-// The number should be interpreted as a percent probability. (0.0f-100.0f)
-float ChoiceIterator::Confidence() const {
-  if (choice_it_ == NULL)
-    return 0.0f;
-  float confidence = 100 + 5 * choice_it_->data()->certainty();
-  if (confidence < 0.0f) confidence = 0.0f;
-  if (confidence > 100.0f) confidence = 100.0f;
-  return confidence;
-}
-
-
-}  // namespace tesseract.
--- a/api/resultiterator.h
+++ b/api/resultiterator.h
@ -1,160 +0,0 @@
-///////////////////////////////////////////////////////////////////////
-// File:        resultiterator.h
-// Description: Iterator for tesseract results that avoids using tesseract
-//              internal data structures.
-// Author:      Ray Smith
-// Created:     Fri Feb 26 11:01:06 PST 2010
-//
-// (C) Copyright 2010, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-///////////////////////////////////////////////////////////////////////
-
-#ifndef TESSERACT_API_RESULTITERATOR_H__
-#define TESSERACT_API_RESULTITERATOR_H__
-
-#include "pageiterator.h"
-
-class BLOB_CHOICE_IT;
-
-namespace tesseract {
-
-class Tesseract;
-
-// Class to iterate over tesseract results, providing access to all levels
-// of the page hierarchy, without including any tesseract headers or having
-// to handle any tesseract structures.
-// WARNING! This class points to data held within the TessBaseAPI class, and
-// therefore can only be used while the TessBaseAPI class still exists and
-// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
-// DetectOS, or anything else that changes the internal PAGE_RES.
-// See apitypes.h for the definition of PageIteratorLevel.
-// See also base class PageIterator, which contains the bulk of the interface.
-// ResultIterator adds text-specific methods for access to OCR output.
-
-class ResultIterator : public PageIterator {
-  friend class ChoiceIterator;
- public:
-  // page_res and tesseract come directly from the BaseAPI.
-  // The rectangle parameters are copied indirectly from the Thresholder,
-  // via the BaseAPI. They represent the coordinates of some rectangle in an
-  // original image (in top-left-origin coordinates) and therefore the top-left
-  // needs to be added to any output boxes in order to specify coordinates
-  // in the original image. See TessBaseAPI::SetRectangle.
-  // The scale and scaled_yres are in case the Thresholder scaled the image
-  // rectangle prior to thresholding. Any coordinates in tesseract's image
-  // must be divided by scale before adding (rect_left, rect_top).
-  // The scaled_yres indicates the effective resolution of the binary image
-  // that tesseract has been given by the Thresholder.
-  // After the constructor, Begin has already been called.
-  ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                 int scale, int scaled_yres,
-                 int rect_left, int rect_top,
-                 int rect_width, int rect_height);
-  virtual ~ResultIterator();
-
-  // ResultIterators may be copied! This makes it possible to iterate over
-  // all the objects at a lower level, while maintaining an iterator to
-  // objects at a higher level. These constructors DO NOT CALL Begin, so
-  // iterations will continue from the location of src.
-  // TODO: For now the copy constructor and operator= only need the base class
-  // versions, but if new data members are added, don't forget to add them!
-
-  // ============= Moving around within the page ============.
-
-  // See PageIterator.
-
-  // ============= Accessing data ==============.
-
-  // Returns the null terminated UTF-8 encoded text string for the current
-  // object at the given level. Use delete [] to free after use.
-  char* GetUTF8Text(PageIteratorLevel level) const;
-
-  // Returns the mean confidence of the current object at the given level.
-  // The number should be interpreted as a percent probability. (0.0f-100.0f)
-  float Confidence(PageIteratorLevel level) const;
-
-  // ============= Functions that refer to words only ============.
-
-  // Returns the font attributes of the current word. If iterating at a higher
-  // level object than words, eg textlines, then this will return the
-  // attributes of the first word in that textline.
-  // The actual return value is a string representing a font name. It points
-  // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
-  // the iterator itself, ie rendered invalid by various members of
-  // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
-  // Pointsize is returned in printers points (1/72 inch.)
-  const char* WordFontAttributes(bool* is_bold,
-                                 bool* is_italic,
-                                 bool* is_underlined,
-                                 bool* is_monospace,
-                                 bool* is_serif,
-                                 bool* is_smallcaps,
-                                 int* pointsize,
-                                 int* font_id) const;
-
-  // Returns true if the current word was found in a dictionary.
-  bool WordIsFromDictionary() const;
-
-  // Returns true if the current word is numeric.
-  bool WordIsNumeric() const;
-
-  // ============= Functions that refer to symbols only ============.
-
-  // Returns true if the current symbol is a superscript.
-  // If iterating at a higher level object than symbols, eg words, then
-  // this will return the attributes of the first symbol in that word.
-  bool SymbolIsSuperscript() const;
-  // Returns true if the current symbol is a subscript.
-  // If iterating at a higher level object than symbols, eg words, then
-  // this will return the attributes of the first symbol in that word.
-  bool SymbolIsSubscript() const;
-  // Returns true if the current symbol is a dropcap.
-  // If iterating at a higher level object than symbols, eg words, then
-  // this will return the attributes of the first symbol in that word.
-  bool SymbolIsDropcap() const;
-};
-
-// Class to iterate over the classifier choices for a single RIL_SYMBOL.
-class ChoiceIterator {
- public:
-  // Construction is from a ResultIterator that points to the symbol of
-  // interest. The ChoiceIterator allows a one-shot iteration over the
-  // choices for this symbol and after that is is useless.
-  explicit ChoiceIterator(const ResultIterator& result_it);
-  ~ChoiceIterator();
-
-  // Moves to the next choice for the symbol and returns false if there
-  // are none left.
-  bool Next();
-
-  // ============= Accessing data ==============.
-
-  // Returns the null terminated UTF-8 encoded text string for the current
-  // choice.
-  // NOTE: Unlike ResultIterator::GetUTF8Text, the return points to an
-  // internal structure and should NOT be delete[]ed to free after use.
-  const char* GetUTF8Text() const;
-
-  // Returns the confidence of the current choice.
-  // The number should be interpreted as a percent probability. (0.0f-100.0f)
-  float Confidence() const;
-
- private:
-  // Pointer to the Tesseract object owned by the API.
-  Tesseract* tesseract_;
-  // Iterator over the blob choices.
-  BLOB_CHOICE_IT* choice_it_;
-};
-
-}  // namespace tesseract.
-
-#endif  // TESSERACT_API_RESULT_ITERATOR_H__
--- a/ccmain/pageiterator.cpp
+++ b/ccmain/pageiterator.cpp
@ -36,7 +36,7 @@ PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
    rect_left_(rect_left), rect_top_(rect_top),
    rect_width_(rect_width), rect_height_(rect_height) {
  it_ = new PAGE_RES_IT(page_res);
-  Begin();
+  PageIterator::Begin();
 }

 PageIterator::~PageIterator() {
@ -73,6 +73,11 @@ const PageIterator& PageIterator::operator=(const PageIterator& src) {
  return *this;
 }

+bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
+  return (it_ == NULL && it_ == other) ||
+     ((other != NULL) && (it_ != NULL) && (*it_ == *other));
+}
+
 // ============= Moving around within the page ============.

 // Resets the iterator to point to the start of the page.
@ -81,12 +86,38 @@ void PageIterator::Begin() {
  BeginWord(0);
 }

+void PageIterator::RestartParagraph() {
+  if (it_->block() == NULL) return; // At end of the document.
+  PAGE_RES_IT para(page_res_);
+  PAGE_RES_IT next_para(para);
+  next_para.forward_paragraph();
+  while (next_para.cmp(*it_) <= 0) {
+    para = next_para;
+    next_para.forward_paragraph();
+  }
+  *it_ = para;
+  BeginWord(0);
+}
+
+bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
+  PageIterator p_start(*this);
+  p_start.RestartParagraph();
+  return p_start.it_->row() == it_->row();
+}
+
+void PageIterator::RestartRow() {
+  it_->restart_row();
+  BeginWord(0);
+}
+
 // Moves to the start of the next object at the given level in the
 // page hierarchy, and returns false if the end of the page was reached.
-// NOTE that RIL_SYMBOL will skip non-text blocks, but all other
-// PageIteratorLevel level values will visit each non-text block once.
-// Think of non text blocks as containing a single para, with a single line,
-// with a single imaginary word.
+// NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
+// non-text block at least once.
+// Think of non text blocks as containing a single para, with at least one
+// line, with a single imaginary word, containing a single symbol.
+// The bounding boxes mark out any polygonal nature of the block, and
+// PTIsTextType(BlockType()) is false for non-text blocks.
 // Calls to Next with different levels may be freely intermixed.
 // This function iterates words in right-to-left scripts correctly, if
 // the appropriate language has been loaded into Tesseract.
@ -97,9 +128,11 @@ bool PageIterator::Next(PageIteratorLevel level) {

  switch (level) {
    case RIL_BLOCK:
-    case RIL_PARA:
      it_->forward_block();
      break;
+    case RIL_PARA:
+      it_->forward_paragraph();
+      break;
    case RIL_TEXTLINE:
      for (it_->forward_with_empties(); it_->row() == it_->prev_row();
           it_->forward_with_empties());
@ -112,7 +145,7 @@ bool PageIterator::Next(PageIteratorLevel level) {
        cblob_it_->forward();
      ++blob_index_;
      if (blob_index_ >= word_length_)
-        it_->forward();
+        it_->forward_with_empties();
      else
        return true;
      break;
@ -129,10 +162,13 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->word() == NULL) return true;  // In an image block.
  switch (level) {
    case RIL_BLOCK:
+      return blob_index_ == 0 && it_->block() != it_->prev_block();
    case RIL_PARA:
-      return it_->block() != it_->prev_block();
+      return blob_index_ == 0 &&
+          (it_->block() != it_->prev_block() ||
+           it_->row()->row->para() != it_->prev_row()->row->para());
    case RIL_TEXTLINE:
-      return it_->row() != it_->prev_row();
+      return blob_index_ == 0 && it_->row() != it_->prev_row();
    case RIL_WORD:
      return blob_index_ == 0;
    case RIL_SYMBOL:
@ -145,7 +181,7 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
 // given level. (e.g. the last word in a line, the last line in a block)
 bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
                                    PageIteratorLevel element) const {
-  if (it_->word() == NULL) return true;  // Already at the end!
+  if (Empty(element)) return true;  // Already at the end!
  // The result is true if we step forward by element and find we are
  // at the the end of the page or at beginning of *all* levels in:
  // [level, element).
@ -154,7 +190,7 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
  // word on a line, so we also have to be at the first symbol in a word.
  PageIterator next(*this);
  next.Next(element);
-  if (next.it_->word() == NULL) return true;  // Reached the end of the page.
+  if (next.Empty(element)) return true;  // Reached the end of the page.
  while (element > level) {
    element = static_cast<PageIteratorLevel>(element - 1);
    if (!next.IsAtBeginningOf(element))
@ -163,6 +199,21 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
  return true;
 }

+// Returns whether this iterator is positioned
+//   before other:   -1
+//   equal to other:  0
+//   after other:     1
+int PageIterator::Cmp(const PageIterator &other) const {
+  int word_cmp = it_->cmp(*other.it_);
+  if (word_cmp != 0)
+    return word_cmp;
+  if (blob_index_ < other.blob_index_)
+    return -1;
+  if (blob_index_ == other.blob_index_)
+    return 0;
+  return 1;
+}
+
 // ============= Accessing data ==============.
 // Coordinate system:
 // Integer coordinates are at the cracks between the pixels.
@ -176,22 +227,25 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
 // If an image rectangle has been set in the API, then returned coordinates
 // relate to the original (full) image, rather than the rectangle.

-// Returns the bounding rectangle of the current object at the given level.
+// Returns the bounding rectangle of the current object at the given level in
+// the coordinates of the working image that is pix_binary().
 // See comment on coordinate system above.
 // Returns false if there is no such object at the current position.
-bool PageIterator::BoundingBox(PageIteratorLevel level,
-                               int* left, int* top,
-                               int* right, int* bottom) const {
-  if (it_->block() == NULL) return false;  // Already at the end!
-  if (it_->word() == NULL && level != RIL_BLOCK) return false;
-  if (level == RIL_SYMBOL && blob_index_ >= word_length_)
-    return false;  // Zero length word, or already at the end of it.
+bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
+                                       int* left, int* top,
+                                       int* right, int* bottom) const {
+  if (Empty(level))
+    return false;
  TBOX box;
+  PARA *para = NULL;
  switch (level) {
    case RIL_BLOCK:
-    case RIL_PARA:
      box = it_->block()->block->bounding_box();
      break;
+    case RIL_PARA:
+      para = it_->row()->row->para();
+      if (para == NULL) return false;
+      // explicit fall-through.
    case RIL_TEXTLINE:
      box = it_->row()->row->bounding_box();
      break;
@ -204,22 +258,59 @@ bool PageIterator::BoundingBox(PageIteratorLevel level,
      else
        box = cblob_it_->data()->bounding_box();
  }
+  if (level == RIL_PARA) {
+    PageIterator other = *this;
+    other.Begin();
+    do {
+      if (other.it_->row() && other.it_->row()->row &&
+          other.it_->row()->row->para() == para) {
+        box = box.bounding_union(other.it_->row()->row->bounding_box());
+      }
+    } while (other.Next(RIL_TEXTLINE));
+  }
  if (level != RIL_SYMBOL || cblob_it_ != NULL)
    box.rotate(it_->block()->block->re_rotation());
  // Now we have a box in tesseract coordinates relative to the image rectangle,
-  // we have to convert the coords to global page coords in a top-down system.
-  *left = ClipToRange(box.left() / scale_ + rect_left_,
+  // we have to convert the coords to a top-down system.
+  const int pix_height = pixGetHeight(tesseract_->pix_binary());
+  const int pix_width = pixGetWidth(tesseract_->pix_binary());
+  *left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
+  *top = ClipToRange(pix_height - box.top(), 0, pix_height);
+  *right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
+  *bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
+  return true;
+}
+
+// Returns the bounding rectangle of the current object at the given level in
+// coordinates of the original image.
+// See comment on coordinate system above.
+// Returns false if there is no such object at the current position.
+bool PageIterator::BoundingBox(PageIteratorLevel level,
+                               int* left, int* top,
+                               int* right, int* bottom) const {
+  if (!BoundingBoxInternal(level, left, top, right, bottom))
+    return false;
+  // Convert to the coordinate system of the original image.
+  *left = ClipToRange(*left / scale_ + rect_left_,
                      rect_left_, rect_left_ + rect_width_);
-  *top = ClipToRange((rect_height_ - box.top()) / scale_ + rect_top_,
+  *top = ClipToRange(*top / scale_ + rect_top_,
                     rect_top_, rect_top_ + rect_height_);
-  *right = ClipToRange((box.right() + scale_ - 1) / scale_ + rect_left_,
+  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_,
                       *left, rect_left_ + rect_width_);
-  *bottom = ClipToRange((rect_height_ - box.bottom() + scale_ - 1) / scale_
-                           + rect_top_,
+  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_,
                        *top, rect_top_ + rect_height_);
  return true;
 }

+// Returns true if there is no such object at the given level.
+bool PageIterator::Empty(PageIteratorLevel level) const {
+  if (it_->block() == NULL) return true;  // Already at the end!
+  if (it_->word() == NULL && level != RIL_BLOCK) return true;  // image block
+  if (level == RIL_SYMBOL && blob_index_ >= word_length_)
+    return true;  // Zero length word, or already at the end of it.
+  return false;
+}
+
 // Returns the type of the current block. See apitypes.h for PolyBlockType.
 PolyBlockType PageIterator::BlockType() const {
  if (it_->block() == NULL || it_->block()->block == NULL)
@ -230,7 +321,8 @@ PolyBlockType PageIterator::BlockType() const {
 }

 // Returns a binary image of the current object at the given level.
-// The position and size match the return from BoundingBox.
+// The position and size match the return from BoundingBoxInternal, and so this
+// could be upscaled with respect to the original input image.
 // Use pixDestroy to delete the image after use.
 // The following methods are used to generate the images:
 // RIL_BLOCK: mask the page image with the block polygon.
@ -250,22 +342,23 @@ PolyBlockType PageIterator::BlockType() const {
 // components.
 Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
  int left, top, right, bottom;
-  if (!BoundingBox(level, &left, &top, &right, &bottom))
+  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
    return NULL;
  Pix* pix = NULL;
  switch (level) {
    case RIL_BLOCK:
-    case RIL_PARA:
      pix = it_->block()->block->render_mask();
      // AND the mask and the image.
      pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
                  PIX_SRC & PIX_DST, tesseract_->pix_binary(),
                  left, top);
      break;
+    case RIL_PARA:
    case RIL_TEXTLINE:
    case RIL_WORD:
    case RIL_SYMBOL:
-      if (level == RIL_SYMBOL && cblob_it_ != NULL)
+      if (level == RIL_SYMBOL && cblob_it_ != NULL &&
+          cblob_it_->data()->area() != 0)
        return cblob_it_->data()->render();
      // Just clip from the bounding box.
      Box* box = boxCreate(left, top, right - left, bottom - top);
@ -301,7 +394,7 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
  Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
  Pix* grey_pix = pixClipRectangle(pix, box, NULL);
  boxDestroy(&box);
-  if (level == RIL_BLOCK || level == RIL_PARA) {
+  if (level == RIL_BLOCK) {
    Pix* mask = it_->block()->block->render_mask();
    Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
    pixRasterop(expanded_mask, padding, padding,
@ -316,7 +409,6 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
  return grey_pix;
 }

-
 // Returns the baseline of the current object at the given level.
 // The baseline is the line that passes through (x1, y1) and (x2, y2).
 // WARNING: with vertical text, baselines may be vertical!
@ -345,7 +437,7 @@ bool PageIterator::Baseline(PageIteratorLevel level,
 void PageIterator::Orientation(tesseract::Orientation *orientation,
                               tesseract::WritingDirection *writing_direction,
                               tesseract::TextlineOrder *textline_order,
-                               float *deskew_angle) {
+                               float *deskew_angle) const {
  BLOCK* block = it_->block()->block;

  // Orientation
@ -388,6 +480,22 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
  *deskew_angle = -skew.angle();
 }

+// Reports layout information about the paragraph containing the current row.
+//   just              - justification taken from the paragraph's model.
+//   is_list_item      - copied from the paragraph's is_list_item flag.
+//   is_crown          - copied from is_very_first_or_continuation.
+//   first_line_indent - model first_indent() minus body_indent().
+// Every output is always written: when the current row has no paragraph or
+// the paragraph has no model, the outputs are JUSTIFICATION_UNKNOWN, false,
+// false and 0 respectively.
+void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
+                                 bool *is_list_item,
+                                 bool *is_crown,
+                                 int *first_line_indent) const {
+  // Give all outputs a defined value first so that callers never read
+  // uninitialized data after the early return below.
+  *just = tesseract::JUSTIFICATION_UNKNOWN;
+  *is_list_item = false;
+  *is_crown = false;
+  *first_line_indent = 0;
+  if (!it_->row() || !it_->row()->row || !it_->row()->row->para() ||
+      !it_->row()->row->para()->model)
+    return;
+
+  PARA *para = it_->row()->row->para();
+  // Previously the justification was left as UNKNOWN even when a model was
+  // available; report what the model says.
+  *just = para->model->justification();
+  *is_list_item = para->is_list_item;
+  *is_crown = para->is_very_first_or_continuation;
+  *first_line_indent = para->model->first_indent() -
+      para->model->body_indent();
+}
+
 // Sets up the internal data for iterating the blobs of a new word, then
 // moves the iterator to the given offset.
 void PageIterator::BeginWord(int offset) {
@ -404,6 +512,12 @@ void PageIterator::BeginWord(int offset) {
    // is already baseline denormalized.
    word_length_ = word_res->best_choice->length();
    ASSERT_HOST(word_res->box_word != NULL);
+    if (word_res->box_word->length() != word_length_) {
+      tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
+              word_length_, word_res->best_choice->unichar_string().string(),
+              word_res->box_word->length());
+      word_res->box_word->bounding_box().print();
+    }
    ASSERT_HOST(word_res->box_word->length() == word_length_);
    word_ = NULL;
    // We will be iterating the box_word.
--- a/ccmain/pageiterator.h
+++ b/ccmain/pageiterator.h
@ -18,10 +18,10 @@
 //
 ///////////////////////////////////////////////////////////////////////

-#ifndef TESSERACT_API_PAGEITERATOR_H__
-#define TESSERACT_API_PAGEITERATOR_H__
+#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
+#define TESSERACT_CCMAIN_PAGEITERATOR_H__

-#include "apitypes.h"
+#include "publictypes.h"

 class C_BLOB_IT;
 class PBLOB_IT;
@ -72,10 +72,27 @@ class PageIterator {
  PageIterator(const PageIterator& src);
  const PageIterator& operator=(const PageIterator& src);

+  // Are we positioned at the same location as other?
+  bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
+
  // ============= Moving around within the page ============.

  // Moves the iterator to point to the start of the page to begin an iteration.
-  void Begin();
+  virtual void Begin();
+
+  // Moves the iterator to the beginning of the paragraph.
+  // This class implements this functionality by moving it to the zero indexed
+  // blob of the first (leftmost) word on the first row of the paragraph.
+  virtual void RestartParagraph();
+
+  // Return whether this iterator points anywhere in the first textline of a
+  // paragraph.
+  bool IsWithinFirstTextlineOfParagraph() const;
+
+  // Moves the iterator to the beginning of the text line.
+  // This class implements this functionality by moving it to the zero indexed
+  // blob of the first (leftmost) word of the row.
+  virtual void RestartRow();

  // Moves to the start of the next object at the given level in the
  // page hierarchy, and returns false if the end of the page was reached.
@ -86,17 +103,43 @@ class PageIterator {
  // Calls to Next with different levels may be freely intermixed.
  // This function iterates words in right-to-left scripts correctly, if
  // the appropriate language has been loaded into Tesseract.
-  bool Next(PageIteratorLevel level);
+  virtual bool Next(PageIteratorLevel level);

  // Returns true if the iterator is at the start of an object at the given
-  // level. Possible uses include determining if a call to Next(RIL_WORD)
-  // moved to the start of a RIL_PARA.
-  bool IsAtBeginningOf(PageIteratorLevel level) const;
+  // level.
+  //
+  // For instance, suppose an iterator it is pointed to the first symbol of the
+  // first word of the third line of the second paragraph of the first block in
+  // a page, then:
+  //   it.IsAtBeginningOf(RIL_BLOCK) = false
+  //   it.IsAtBeginningOf(RIL_PARA) = false
+  //   it.IsAtBeginningOf(RIL_TEXTLINE) = true
+  //   it.IsAtBeginningOf(RIL_WORD) = true
+  //   it.IsAtBeginningOf(RIL_SYMBOL) = true
+  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;

  // Returns whether the iterator is positioned at the last element in a
  // given level. (e.g. the last word in a line, the last line in a block)
-  bool IsAtFinalElement(PageIteratorLevel level,
-                        PageIteratorLevel element) const;
+  //
+  //     Here's some two-paragraph example
+  //   text.  It starts off innocuously
+  //   enough but quickly turns bizarre.
+  //     The author inserts a cornucopia
+  //   of words to guard against confused
+  //   references.
+  //
+  // Now take an iterator it pointed to the start of "bizarre."
+  //  it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
+  //  it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
+  //  it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
+  virtual bool IsAtFinalElement(PageIteratorLevel level,
+                                PageIteratorLevel element) const;
+
+  // Returns whether this iterator is positioned
+  //   before other:   -1
+  //   equal to other:  0
+  //   after other:     1
+  int Cmp(const PageIterator &other) const;

  // ============= Accessing data ==============.
  // Coordinate system:
@ -120,12 +163,21 @@ class PageIterator {
  // the image to include more foreground pixels. See GetImage below.
  bool BoundingBox(PageIteratorLevel level,
                   int* left, int* top, int* right, int* bottom) const;
+  // Returns the bounding rectangle of the object in a coordinate system of the
+  // working image rectangle having its origin at (rect_left_, rect_top_) with
+  // respect to the original image and is scaled by a factor scale_.
+  bool BoundingBoxInternal(PageIteratorLevel level,
+                           int* left, int* top, int* right, int* bottom) const;
+
+  // Returns whether there is no object of a given level.
+  bool Empty(PageIteratorLevel level) const;

   // Returns the type of the current block. See publictypes.h for PolyBlockType.
  PolyBlockType BlockType() const;

  // Returns a binary image of the current object at the given level.
-  // The position and size match the return from BoundingBox.
+  // The position and size match the return from BoundingBoxInternal, and so
+  // this could be upscaled with respect to the original input image.
  // Use pixDestroy to delete the image after use.
  Pix* GetBinaryImage(PageIteratorLevel level) const;

@ -156,7 +208,38 @@ class PageIterator {
  void Orientation(tesseract::Orientation *orientation,
                   tesseract::WritingDirection *writing_direction,
                   tesseract::TextlineOrder *textline_order,
-                   float *deskew_angle);
+                   float *deskew_angle) const;
+
+  // Returns information about the current paragraph, if available.
+  //
+  //   justification -
+  //     LEFT if ragged right, or fully justified and script is left-to-right.
+  //     RIGHT if ragged left, or fully justified and script is right-to-left.
+  //     unknown if it looks like source code or we have very few lines.
+  //   is_list_item -
+  //     true if we believe this is a member of an ordered or unordered list.
+  //   is_crown -
+  //     true if the first line of the paragraph is aligned with the other
+  //     lines of the paragraph even though subsequent paragraphs have first
+  //     line indents.  This typically indicates that this is the continuation
+  //     of a previous paragraph or that it is the very first paragraph in
+  //     the chapter.
+  //   first_line_indent -
+  //     For LEFT aligned paragraphs, the first text line of paragraphs of
+  //     this kind are indented this many pixels from the left edge of the
+  //     rest of the paragraph.
+  //     for RIGHT aligned paragraphs, the first text line of paragraphs of
+  //     this kind are indented this many pixels from the right edge of the
+  //     rest of the paragraph.
+  //     NOTE 1: This value may be negative.
+  //     NOTE 2: if *is_crown == true, the first line of this paragraph is
+  //             actually flush, and first_line_indent is set to the "common"
+  //             first_line_indent for subsequent paragraphs in this block
+  //             of text.
+  void ParagraphInfo(tesseract::ParagraphJustification *justification,
+                     bool *is_list_item,
+                     bool *is_crown,
+                     int *first_line_indent) const;

 protected:
  // Sets up the internal data for iterating the blobs of a new word, then
@ -192,4 +275,4 @@ class PageIterator {

 }  // namespace tesseract.

-#endif  // TESSERACT_API_PAGEITERATOR_H__
+#endif  // TESSERACT_CCMAIN_PAGEITERATOR_H__
--- a/ccmain/resultiterator.cpp
+++ b/ccmain/resultiterator.cpp
@ -0,0 +1,663 @@
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.cpp
+// Description: Iterator for tesseract results that is capable of
+//              iterating in proper reading order over Bi Directional
+//              (e.g. mixed Hebrew and English) text.
+// Author:      David Eger
+// Created:     Fri May 27 13:58:06 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "resultiterator.h"
+
+#include "allheaders.h"
+#include "pageres.h"
+#include "strngs.h"
+#include "tesseractclass.h"
+#include "unicharset.h"
+#include "unicodes.h"
+
+namespace tesseract {
+
+// Builds a bidi-aware iterator from a left-to-right iterator. The minor-run
+// state starts cleared, the direction of the current paragraph is computed,
+// and the iterator is then moved to the logical start of its text line.
+ResultIterator::ResultIterator(const LTRResultIterator &resit)
+    : LTRResultIterator(resit) {
+  at_beginning_of_minor_run_ = false;
+  in_minor_direction_ = false;
+  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+  MoveToLogicalStartOfTextline();
+}
+
+// Factory: allocates a ResultIterator with new, constructed from resit, and
+// returns the raw pointer to the caller.
+ResultIterator *ResultIterator::StartOfParagraph(
+    const LTRResultIterator &resit) {
+  ResultIterator *bidi_it = new ResultIterator(resit);
+  return bidi_it;
+}
+
+// Returns the cached direction of the current paragraph (true means
+// left-to-right), as last computed by CurrentParagraphIsLtr().
+bool ResultIterator::ParagraphIsLtr() const {
+  const bool is_ltr = current_paragraph_is_ltr_;
+  return is_ltr;
+}
+
+// Decides whether the paragraph containing the current position reads
+// left-to-right. Works on a private LTRResultIterator copy, so this
+// iterator's own position is not moved. Returns true (LTR) when there is
+// no current word.
+bool ResultIterator::CurrentParagraphIsLtr() const {
+  if (!it_->word())
+    return true;  // doesn't matter.
+  LTRResultIterator it(*this);
+  it.RestartParagraph();
+  // Try to figure out the ltr-ness of the paragraph.  The rules below
+  // make more sense in the context of a difficult paragraph example.
+  // Here we denote {ltr characters, RTL CHARACTERS}:
+  //
+  //   "don't go in there!" DAIS EH
+  //   EHT OTNI DEPMUJ FELSMIH NEHT DNA
+  //                  .GNIDLIUB GNINRUB
+  //
+  // On the first line, the left-most word is LTR and the rightmost word
+  // is RTL.  Thus, we are better off taking the majority direction for
+  // the whole paragraph contents.  So instead of "the leftmost word is LTR"
+  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
+  // would not do:  Typically an RTL paragraph would *not* start with an LTR
+  // word.  So our heuristics are as follows:
+  //
+  // (1) If the first text line has an RTL word in the left-most position
+  //     it is RTL.
+  // (2) If the first text line has an LTR word in the right-most position
+  //     it is LTR.
+  // (3) If neither of the above is true, take the majority count for the
+  //     paragraph -- if there are more rtl words, it is RTL.  If there
+  //     are more LTR words, it's LTR.
+  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
+  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
+  int num_ltr, num_rtl;
+  num_rtl = leftmost_rtl ? 1 : 0;
+  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+  // Walk the remaining words of the first text line. rightmost_ltr is
+  // overwritten on every step, so it ends up describing the last word seen.
+  for (it.Next(RIL_WORD);
+       !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
+       it.Next(RIL_WORD)) {
+    StrongScriptDirection dir = it.WordDirection();
+    rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
+    num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+    num_ltr += rightmost_ltr ? 1 : 0;
+  }
+  if (leftmost_rtl)
+    return false;
+  if (rightmost_ltr)
+    return true;
+  // First line is ambiguous.  Take statistics on the whole paragraph.
+  // The iterator now sits just past the first line; keep counting until the
+  // page ends or the next paragraph begins. Ties count as LTR.
+  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
+    StrongScriptDirection dir = it.WordDirection();
+    num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+    num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
+  return num_ltr >= num_rtl;
+}
+
+// Negative sentinel values that CalculateTextlineOrder() interleaves with the
+// (non-negative) word indices of a text line's reading order:
+//   kMinorRunStart - pushed before a run of minor-direction words.
+//   kMinorRunEnd   - pushed after a run of minor-direction words.
+//   kComplexWord   - pushed after the index of a DIR_MIX word.
+const int ResultIterator::kMinorRunStart = -1;
+const int ResultIterator::kMinorRunEnd = -2;
+const int ResultIterator::kComplexWord = -3;
+
+// Fills *blob_indices with the indices of the current word's blobs in the
+// order they should be read. The vector is cleared first and left empty when
+// there is no current word. When the reading context is left-to-right, or the
+// word reports UnicharsInReadingOrder(), the result is simply
+// 0..word_length_-1. Otherwise each blob is classified as L or R from its
+// SymbolDirection() and the blobs are emitted from the right end of the word,
+// keeping each L run in its original left-to-right order.
+void ResultIterator::CalculateBlobOrder(
+    GenericVector<int> *blob_indices) const {
+  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+  blob_indices->clear();
+  if (Empty(RIL_WORD)) return;
+  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
+    // Easy! just return the blobs in order;
+    for (int i = 0; i < word_length_; i++)
+      blob_indices->push_back(i);
+    return;
+  }
+
+  // The blobs are in left-to-right order, but the current reading context
+  // is right-to-left.
+  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
+  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
+  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
+  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
+  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
+  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
+  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
+
+  // Step 1: Scan for and mark European Number sequences
+  //   [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
+  GenericVector<int> letter_types;
+  for (int i = 0; i < word_length_; i++) {
+    letter_types.push_back(it_->word()->SymbolDirection(i));
+  }
+  // Convert a single separator sandwiched between two EN's into an EN.
+  for (int i = 0; i + 2 < word_length_; i++) {
+    if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
+        (letter_types[i + 1] == U_EURO_NUM_SEP ||
+         letter_types[i + 1] == U_COMMON_NUM_SEP)) {
+      letter_types[i + 1] = U_EURO_NUM;
+    }
+  }
+  // Scan for sequences of European Number Terminators around ENs and convert
+  // them to ENs.
+  for (int i = 0; i < word_length_; i++) {
+    if (letter_types[i] == U_EURO_NUM_TERM) {
+      int j = i + 1;
+      while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
+      if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
+        // The terminators [i..j) precede the EN at j; convert them to
+        // European Numbers.
+        for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
+      }
+      j = i - 1;
+      while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
+      if (j > -1 && letter_types[j] == U_EURO_NUM) {
+        // The sequence [j..i] should be converted to all European Numbers.
+        for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
+      }
+    }
+  }
+  // Step 2: Convert all remaining types to either L or R.
+  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
+  // All other are R.
+  for (int i = 0; i < word_length_;) {
+    int ti = letter_types[i];
+    if (ti == U_LTR || ti == U_EURO_NUM) {
+      // Left to right sequence; scan to the end of it.
+      int last_good = i;
+      for (int j = i + 1; j < word_length_; j++) {
+        int tj = letter_types[j];
+        if (tj == U_LTR || tj == U_EURO_NUM) {
+          last_good = j;
+        } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
+          // do nothing.
+        } else {
+          break;
+        }
+      }
+      // [i..last_good] is the L sequence
+      for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
+      i = last_good + 1;
+    } else {
+      letter_types[i] = U_RTL;
+      i++;
+    }
+  }
+
+  // At this point, letter_types is entirely U_LTR or U_RTL.
+  // Emit indices starting from the right-most blob: R blobs one at a time,
+  // L runs as a block in ascending index order.
+  for (int i = word_length_ - 1; i >= 0;) {
+    if (letter_types[i] == U_RTL) {
+      blob_indices->push_back(i);
+      i--;
+    } else {
+      // left to right sequence.  scan to the beginning.
+      int j = i - 1;
+      for (; j >= 0 && letter_types[j] != U_RTL; j--) { }  // pass
+      // Now (j, i] is LTR
+      for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
+      i = j;
+    }
+  }
+  ASSERT_HOST(blob_indices->size() == word_length_);
+}
+
+// Debug helper: prints one letter per entry of dirs (N = neutral,
+// L = left-to-right, R = right-to-left, Z = mixed, ? = anything else),
+// each followed by a space, and then a newline.
+static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
+  const int num_dirs = dirs.size();
+  for (int d = 0; d < num_dirs; ++d) {
+    const StrongScriptDirection dir = dirs[d];
+    if (dir == DIR_NEUTRAL) {
+      tprintf ("N ");
+    } else if (dir == DIR_LEFT_TO_RIGHT) {
+      tprintf("L ");
+    } else if (dir == DIR_RIGHT_TO_LEFT) {
+      tprintf("R ");
+    } else if (dir == DIR_MIX) {
+      tprintf("Z ");
+    } else {
+      tprintf("? ");
+    }
+  }
+  tprintf("\n");
+}
+
+// Convenience overload for callers that only want the reading order of the
+// text line containing resit: the per-word directions are collected into a
+// local vector and discarded.
+void ResultIterator::CalculateTextlineOrder(
+    bool paragraph_is_ltr,
+    const LTRResultIterator &resit,
+    GenericVectorEqEq<int> *word_indices) const {
+  GenericVector<StrongScriptDirection> unused_dirs;
+  CalculateTextlineOrder(paragraph_is_ltr, resit, &unused_dirs, word_indices);
+}
+
+// Computes the reading order of the words in the text line containing resit.
+//   paragraph_is_ltr - major direction of the enclosing paragraph.
+//   resit            - any position within the text line of interest; it is
+//                      copied, so the caller's iterator is not moved.
+//   dirs_arg         - if non-NULL, receives the direction of each word in
+//                      left-to-right order.
+//   word_indices     - receives the word indices in reading order,
+//                      interleaved with the negative kMinorRunStart /
+//                      kMinorRunEnd / kComplexWord markers.
+// Both output vectors are emptied before anything else, so an empty row
+// yields empty outputs rather than leaving stale contents in word_indices.
+void ResultIterator::CalculateTextlineOrder(
+    bool paragraph_is_ltr,
+    const LTRResultIterator &resit,
+    GenericVector<StrongScriptDirection> *dirs_arg,
+    GenericVectorEqEq<int> *word_indices) const {
+  GenericVector<StrongScriptDirection> dirs;
+  GenericVector<StrongScriptDirection> *directions;
+  directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
+  directions->truncate(0);
+  word_indices->truncate(0);
+
+  // A LTRResultIterator goes strictly left-to-right word order.
+  LTRResultIterator ltr_it(resit);
+  ltr_it.RestartRow();
+  if (ltr_it.Empty(RIL_WORD)) return;
+  do {
+    directions->push_back(ltr_it.WordDirection());
+  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
+
+  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
+}
+
+// Given the direction of each word of a text line in left-to-right order,
+// fills *reading_order with the word indices in reading order.
+// Words are visited in the paragraph's major direction (index 0 upward for
+// LTR paragraphs, last index downward for RTL paragraphs). Each run of
+// minor-direction words is emitted in the opposite order, bracketed by
+// kMinorRunStart and kMinorRunEnd. kComplexWord is pushed after the index of
+// a DIR_MIX word emitted outside such a run (and inside the special trailing
+// LTR run handled below). An empty word_dirs yields an empty result.
+void ResultIterator::CalculateTextlineOrder(
+    bool paragraph_is_ltr,
+    const GenericVector<StrongScriptDirection> &word_dirs,
+    GenericVectorEqEq<int> *reading_order) {
+  reading_order->truncate(0);
+  if (word_dirs.size() == 0) return;
+
+  // Take all of the runs of minor direction words and insert them
+  // in reverse order.
+  int minor_direction, major_direction, major_step, start, end;
+  if (paragraph_is_ltr) {
+    start = 0;
+    end = word_dirs.size();
+    major_step = 1;
+    major_direction = DIR_LEFT_TO_RIGHT;
+    minor_direction = DIR_RIGHT_TO_LEFT;
+  } else {
+    start = word_dirs.size() - 1;
+    end = -1;
+    major_step = -1;
+    major_direction = DIR_RIGHT_TO_LEFT;
+    minor_direction = DIR_LEFT_TO_RIGHT;
+    // Special rule: if there are neutral words at the right most side
+    //   of a line adjacent to a left-to-right word in the middle of the
+    //   line, we interpret the end of the line as a single LTR sequence.
+    if (word_dirs[start] == DIR_NEUTRAL) {
+      int neutral_end = start;
+      while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
+        neutral_end--;
+      }
+      if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
+        // LTR followed by neutrals.
+        // Scan for the beginning of the minor left-to-right run.
+        int left = neutral_end;
+        for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
+          if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
+        }
+        reading_order->push_back(kMinorRunStart);
+        for (int i = left; i < word_dirs.size(); i++) {
+          reading_order->push_back(i);
+          if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
+        }
+        reading_order->push_back(kMinorRunEnd);
+        // The main loop below resumes with the word to the left of this run.
+        start = left - 1;
+      }
+    }
+  }
+  for (int i = start; i != end;) {
+    if (word_dirs[i] == minor_direction) {
+      // Find the far end j of the minor run that starts at i: advance to the
+      // next major-direction word (or the end of the line), then back up to
+      // the nearest minor-direction word.
+      int j = i;
+      while (j != end && word_dirs[j] != major_direction)
+        j += major_step;
+      if (j == end) j -= major_step;
+      while (j != i && word_dirs[j] != minor_direction)
+        j -= major_step;
+      //  [j..i] is a minor direction run.
+      reading_order->push_back(kMinorRunStart);
+      for (int k = j; k != i; k -= major_step) {
+        reading_order->push_back(k);
+      }
+      reading_order->push_back(i);
+      reading_order->push_back(kMinorRunEnd);
+      i = j + major_step;
+    } else {
+      reading_order->push_back(i);
+      if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
+      i += major_step;
+    }
+  }
+}
+
+// Returns the zero-based position of the current word within its text line,
+// counted in strict left-to-right order: the number of Next(RIL_WORD) steps
+// a left-to-right iterator needs from the start of the row to reach it.
+int ResultIterator::LTRWordIndex() const {
+  LTRResultIterator row_it(*this);
+  row_it.RestartRow();
+  int steps = 0;
+  for (; !row_it.PositionedAtSameWord(it_); row_it.Next(RIL_WORD)) {
+    ++steps;
+  }
+  return steps;
+}
+
+// Positions the iterator on the blob of the current word that is read first.
+// A word of length zero is simply (re)started at offset 0. Otherwise
+// BeginWord() is only called when the first blob in reading order is not
+// blob 0.
+void ResultIterator::MoveToLogicalStartOfWord() {
+  if (word_length_ != 0) {
+    GenericVector<int> reading_order;
+    CalculateBlobOrder(&reading_order);
+    if (reading_order.size() > 0 && reading_order[0] != 0) {
+      BeginWord(reading_order[0]);
+    }
+    return;
+  }
+  BeginWord(0);
+}
+
+// Returns true if the current blob is the last one of its word in reading
+// order. Also true when there is no current word or the blob order is empty.
+bool ResultIterator::IsAtFinalSymbolOfWord() const {
+  if (it_->word() == NULL) return true;
+  GenericVector<int> reading_order;
+  CalculateBlobOrder(&reading_order);
+  if (reading_order.size() == 0) return true;
+  return reading_order.back() == blob_index_;
+}
+
+// Returns true if the current blob is the first one of its word in reading
+// order. Also true when there is no current word or the blob order is empty.
+bool ResultIterator::IsAtFirstSymbolOfWord() const {
+  if (it_->word() == NULL) return true;
+  GenericVector<int> reading_order;
+  CalculateBlobOrder(&reading_order);
+  if (reading_order.size() == 0) return true;
+  return reading_order[0] == blob_index_;
+}
+
+// Appends to *text the directional mark (kLRM or kRLM), if any, that should
+// follow the current word. Nothing is appended when there is no current word
+// or the word cannot be found in the text line's reading order.
+void ResultIterator::AppendSuffixMarks(STRING *text) const {
+  if (!it_->word()) return;
+  bool reading_direction_is_ltr =
+      current_paragraph_is_ltr_ ^ in_minor_direction_;
+  // scan forward to see what meta-information the word ordering algorithm
+  // left us.
+  // If this word is at the  *end* of a minor run, insert the other
+  // direction's mark;  else if this was a complex word, insert the
+  // current reading order's mark.
+  GenericVectorEqEq<int> textline_order;
+  CalculateTextlineOrder(current_paragraph_is_ltr_,
+                         *this, &textline_order);
+  int this_word_index = LTRWordIndex();
+  int i = textline_order.get_index(this_word_index);
+  if (i < 0) return;
+
+  // Only the last of the markers that directly follow this word matters.
+  int last_non_word_mark = 0;
+  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
+    last_non_word_mark = textline_order[i];
+  }
+  if (last_non_word_mark == kComplexWord) {
+    *text += reading_direction_is_ltr ? kLRM : kRLM;
+  } else if (last_non_word_mark == kMinorRunEnd) {
+    // Leaving a minor run: the "other direction" is the paragraph's own
+    // direction. The two branches used to be identical (RLM then LRM), so
+    // the mark did not depend on the paragraph direction at all.
+    *text += current_paragraph_is_ltr_ ? kLRM : kRLM;
+  }
+}
+
+// Moves the iterator to the word of the current text line that is read
+// first, and to the first blob in reading order within that word.
+// Markers that precede the first word in the reading order update
+// in_minor_direction_; at_beginning_of_minor_run_ is set (never cleared here)
+// when the line starts inside a minor run.
+void ResultIterator::MoveToLogicalStartOfTextline() {
+  RestartRow();
+  GenericVectorEqEq<int> reading_order;
+  CalculateTextlineOrder(current_paragraph_is_ltr_,
+                         static_cast<const LTRResultIterator&>(*this),
+                         &reading_order);
+  // Consume the markers in front of the first real word index.
+  int pos = 0;
+  while (pos < reading_order.size() && reading_order[pos] < 0) {
+    const int marker = reading_order[pos];
+    if (marker == kMinorRunStart) {
+      in_minor_direction_ = true;
+    } else if (marker == kMinorRunEnd) {
+      in_minor_direction_ = false;
+    }
+    ++pos;
+  }
+  if (in_minor_direction_) at_beginning_of_minor_run_ = true;
+  if (pos >= reading_order.size()) return;
+  // Step right from the left-most word of the row to the target word.
+  for (int remaining = reading_order[pos]; remaining > 0; --remaining) {
+    PageIterator::Next(RIL_WORD);
+  }
+  MoveToLogicalStartOfWord();
+}
+
+// Restarts iteration at the beginning of the page: rewinds the base
+// iterator, clears the minor-run state, recomputes the paragraph direction
+// and moves to the logical start of the first text line.
+void ResultIterator::Begin() {
+  LTRResultIterator::Begin();
+  at_beginning_of_minor_run_ = false;
+  in_minor_direction_ = false;
+  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+  MoveToLogicalStartOfTextline();
+}
+
+// Advances to the next object at the given level in reading order.
+// Returns false if the iterator was already at the end, or if the move ran
+// off the end of the page.
+//   RIL_BLOCK / RIL_PARA / RIL_TEXTLINE: advance with PageIterator::Next,
+//     then reposition at the logical start of the new text line.
+//   RIL_SYMBOL: advance within the word's blob reading order, falling
+//     through to RIL_WORD after the last blob.
+//   RIL_WORD: advance within the line's word reading order, moving on to the
+//     next text line after the last word.
+bool ResultIterator::Next(PageIteratorLevel level) {
+  if (it_->block() == NULL) return false; // already at end!
+  switch (level) {
+    case RIL_BLOCK:  // explicit fall-through
+    case RIL_PARA:   // explicit fall-through
+    case RIL_TEXTLINE:
+      if (!PageIterator::Next(level)) return false;
+      if (IsWithinFirstTextlineOfParagraph()) {
+        // if we've advanced to a new paragraph,
+        // recalculate current_paragraph_is_ltr_
+        current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+      }
+      in_minor_direction_ = false;
+      MoveToLogicalStartOfTextline();
+      return it_->block() != NULL;
+    case RIL_SYMBOL:
+    {
+      GenericVector<int> blob_order;
+      CalculateBlobOrder(&blob_order);
+      // Locate the current blob in the reading order, then step past it.
+      int next_blob = 0;
+      while (next_blob < blob_order.size() &&
+             blob_index_ != blob_order[next_blob])
+        next_blob++;
+      next_blob++;
+      if (next_blob < blob_order.size()) {
+        // we're in the same word; simply advance one blob.
+        BeginWord(blob_order[next_blob]);
+        at_beginning_of_minor_run_ = false;
+        return true;
+      }
+      level = RIL_WORD;  // we've fallen through to the next word.
+    }
+    case RIL_WORD:  // explicit fall-through.
+    {
+      if (it_->word() == NULL) return Next(RIL_BLOCK);
+      GenericVectorEqEq<int> word_indices;
+      int this_word_index = LTRWordIndex();
+      CalculateTextlineOrder(current_paragraph_is_ltr_,
+                             *this,
+                             &word_indices);
+      // Index of the last entry that is a real word (not a marker).
+      int final_real_index = word_indices.size() - 1;
+      while (final_real_index > 0 && word_indices[final_real_index] < 0)
+        final_real_index--;
+      // Only positions before the final word can have a successor on this
+      // line.
+      for (int i = 0; i < final_real_index; i++) {
+        if (word_indices[i] == this_word_index) {
+          // Skip the markers between this word and the next one, tracking
+          // whether they take us into or out of a minor run.
+          int j = i + 1;
+          for (; j < final_real_index && word_indices[j] < 0; j++) {
+            if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
+            if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
+          }
+          at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
+          // awesome, we move to word_indices[j]
+          if (BidiDebug(3)) {
+            tprintf("Next(RIL_WORD): %d -> %d\n",
+                    this_word_index, word_indices[j]);
+          }
+          // Reach the target by counting words from the left end of the row.
+          PageIterator::RestartRow();
+          for (int k = 0; k < word_indices[j]; k++) {
+            PageIterator::Next(RIL_WORD);
+          }
+          MoveToLogicalStartOfWord();
+          return true;
+        }
+      }
+      if (BidiDebug(3)) {
+        tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
+      }
+      // we're going off the end of the text line.
+      return Next(RIL_TEXTLINE);
+    }
+  }
+  ASSERT_HOST(false);  // shouldn't happen.
+  return false;
+}
+
+// Returns true if the iterator is at the start, in reading order, of an
+// object at the given level. Returns false once iteration has reached the
+// end, and true for any level when there is no current word.
+bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
+  if (it_->block() == NULL) return false;  // Already at the end!
+  if (it_->word() == NULL) return true;  // In an image block.
+  if (level == RIL_SYMBOL) return true;  // Always at beginning of a symbol.
+
+  bool at_word_start = IsAtFirstSymbolOfWord();
+  if (level == RIL_WORD) return at_word_start;
+
+  // Work on a copy so that this (const) iterator is not moved.
+  ResultIterator line_start(*this);
+  // move to the first word in the line...
+  line_start.MoveToLogicalStartOfTextline();
+
+  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
+  if (level == RIL_TEXTLINE) return at_textline_start;
+
+  // now we move to the left-most word...
+  // Block and paragraph boundaries are detected by comparing the left-most
+  // word's block / row against its predecessor's.
+  line_start.RestartRow();
+  bool at_block_start = at_textline_start &&
+      line_start.it_->block() != line_start.it_->prev_block();
+  if (level == RIL_BLOCK) return at_block_start;
+
+  bool at_para_start = at_block_start ||
+      (at_textline_start &&
+       line_start.it_->row()->row->para() !=
+           line_start.it_->prev_row()->row->para());
+  if (level == RIL_PARA) return at_para_start;
+
+  ASSERT_HOST(false);  // shouldn't happen.
+  return false;
+}
+
+// NOTE! This mirrors PageIterator::IsAtFinalElement; the only difference is
+//   that the look-ahead iterator is a ResultIterator rather than a
+//   PageIterator.
+// Returns true if the current element is the last one of its kind within
+// the enclosing object at the given level, or if there is no such element.
+bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
+                                      PageIteratorLevel element) const {
+  if (Empty(element)) return true;  // Already at the end!
+  // Step a copy forward by one element. We are at the final element if that
+  // reaches the end of the page, or lands at the beginning of *all* levels
+  // in [level, element).
+  // Every intermediate level has to be checked: with more than one level
+  // between element and level, moving forward one symbol can still leave us
+  // in the first word of a line, so we must also be at the first symbol of
+  // a word.
+  ResultIterator lookahead(*this);
+  lookahead.Next(element);
+  if (lookahead.Empty(element)) return true;  // Reached the end of the page.
+  for (int lvl = element - 1; lvl >= level; --lvl) {
+    if (!lookahead.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl)))
+      return false;
+  }
+  return true;
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// object at the given level. Use delete [] to free after use.
+// Returns NULL when there is no current word.
+// Block, paragraph and text line text is assembled in reading order; word
+// and symbol text may be wrapped in directional marks (kLRM / kRLM).
+char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  STRING text;
+  switch (level) {
+    case RIL_BLOCK:
+      {
+        // Concatenate every paragraph that belongs to the current block.
+        ResultIterator pp(*this);
+        do {
+          pp.AppendUTF8ParagraphText(&text);
+        } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
+      }
+      break;
+    case RIL_PARA:
+      AppendUTF8ParagraphText(&text);
+      break;
+    case RIL_TEXTLINE:
+      {
+        // Iterate on a copy; appending a text line advances the iterator.
+        ResultIterator it(*this);
+        it.MoveToLogicalStartOfTextline();
+        it.IterateAndAppendUTF8TextlineText(&text);
+      }
+      break;
+    case RIL_WORD:
+      AppendUTF8WordText(&text);
+      break;
+    case RIL_SYMBOL:
+      {
+        bool reading_direction_is_ltr =
+          current_paragraph_is_ltr_ ^ in_minor_direction_;
+        if (at_beginning_of_minor_run_) {
+          text += reading_direction_is_ltr ? kLRM : kRLM;
+        }
+        // Append (not assign) so the directional mark added above is kept,
+        // matching what AppendUTF8WordText does for whole words.
+        text += it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
+        if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
+      }
+      break;
+  }
+  // Copy into a new[] buffer (including the terminating NUL) that the
+  // caller owns.
+  int length = text.length() + 1;
+  char* result = new char[length];
+  strncpy(result, text.string(), length);
+  return result;
+}
+
+// Appends the UTF-8 text of the current word to *text, blob by blob in
+// reading order. A directional mark is put in front when the word opens a
+// minor run, and AppendSuffixMarks() adds any trailing mark. Does nothing
+// when there is no current word.
+void ResultIterator::AppendUTF8WordText(STRING *text) const {
+  if (it_->word() == NULL) return;
+  ASSERT_HOST(it_->word()->best_choice != NULL);
+  const bool rtl_context =
+      !(current_paragraph_is_ltr_ ^ in_minor_direction_);
+  if (at_beginning_of_minor_run_) {
+    *text += rtl_context ? kRLM : kLRM;
+  }
+
+  GenericVector<int> reading_order;
+  CalculateBlobOrder(&reading_order);
+  const int num_blobs = reading_order.size();
+  for (int b = 0; b < num_blobs; ++b) {
+    *text += it_->word()->BestUTF8(reading_order[b], rtl_context);
+  }
+  AppendSuffixMarks(text);
+}
+
+// Appends the text of the current text line to *text, words separated by
+// single spaces and followed by line_separator_ (plus paragraph_separator_
+// when the line ended a paragraph or the page). The iterator is advanced
+// past the line as a side effect. If there is no current word, the iterator
+// is merely advanced by one word and nothing is appended.
+void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
+  if (Empty(RIL_WORD)) {
+    Next(RIL_WORD);
+    return;
+  }
+  if (BidiDebug(1)) {
+    // Dump the word directions and the computed reading order of this line.
+    GenericVector<StrongScriptDirection> dirs;
+    GenericVectorEqEq<int> textline_order;
+    CalculateTextlineOrder(current_paragraph_is_ltr_,
+                           *this, &dirs, &textline_order);
+    const char *para_dir = current_paragraph_is_ltr_ ? "ltr" : "rtl";
+    tprintf("Strong Script dirs     [%p/P=%s]: ", it_->row(), para_dir);
+    PrintScriptDirs(dirs);
+    tprintf("Logical textline order [%p/P=%s]: ", it_->row(), para_dir);
+    const int num_entries = textline_order.size();
+    for (int e = 0; e < num_entries; ++e) {
+      tprintf("%d ", textline_order[e]);
+    }
+    tprintf("\n");
+  }
+
+  int num_words = 0;
+  for (;;) {
+    AppendUTF8WordText(text);
+    ++num_words;
+    *text += " ";
+    // Stop at the end of the page or when the next line has been entered.
+    if (!Next(RIL_WORD) || IsAtBeginningOf(RIL_TEXTLINE)) break;
+  }
+  if (BidiDebug(1)) {
+    tprintf("%d words printed\n", num_words);
+  }
+  // Drop the space that was added after the last word.
+  text->truncate_at(text->length() - 1);
+  *text += line_separator_;
+  // If we just finished a paragraph, add an extra newline.
+  const bool paragraph_done =
+      it_->block() == NULL || IsAtBeginningOf(RIL_PARA);
+  if (paragraph_done)
+    *text += paragraph_separator_;
+}
+
+// Appends the text of the whole current paragraph to *text, one text line
+// at a time in reading order. Iterates on a copy, so this iterator is not
+// moved. Appends nothing if the paragraph start has no word.
+void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
+  ResultIterator para_it(*this);
+  para_it.RestartParagraph();
+  para_it.MoveToLogicalStartOfTextline();
+  if (para_it.Empty(RIL_WORD)) return;
+  for (bool more_lines = true; more_lines; ) {
+    para_it.IterateAndAppendUTF8TextlineText(text);
+    // Continue until the page ends or the next paragraph begins.
+    more_lines = para_it.it_->block() != NULL &&
+        !para_it.IsAtBeginningOf(RIL_PARA);
+  }
+}
+
+// Returns true if the "bidi_debug" integer parameter is at least min_level.
+// When the parameter cannot be found, a debug level of 1 is assumed.
+bool ResultIterator::BidiDebug(int min_level) const {
+  IntParam *bidi_param = ParamUtils::FindParam<IntParam>(
+      "bidi_debug", GlobalParams()->int_params,
+      tesseract_->params()->int_params);
+  const int debug_level =
+      (bidi_param != NULL) ? static_cast<inT32>(*bidi_param) : 1;
+  return min_level <= debug_level;
+}
+
+}  // namespace tesseract.
--- a/ccmain/resultiterator.h
+++ b/ccmain/resultiterator.h
@ -0,0 +1,198 @@
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.h
+// Description: Iterator for tesseract results that is capable of
+//              iterating in proper reading order over Bi Directional
+//              (e.g. mixed Hebrew and English) text.
+// Author:      David Eger
+// Created:     Fri May 27 13:58:06 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__
+#define TESSERACT_CCMAIN_RESULT_ITERATOR_H__
+
+#include "ltrresultiterator.h"
+#include "genericvector.h"
+
+class BLOB_CHOICE_IT;
+class WERD_RES;
+class STRING;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Iterator over tesseract results that walks the page in proper reading
+// order, including bidirectional (e.g. mixed Hebrew and English) text.
+// Derives from LTRResultIterator and declares reading-order-aware versions
+// of the navigation and text-access methods below.
+class ResultIterator : public LTRResultIterator {
+ public:
+  // Returns a ResultIterator built from resit.  As noted on the protected
+  // constructor, the result is reset to the beginning of the paragraph
+  // rather than staying wherever resit pointed.
+  // NOTE(review): presumably the result is heap-allocated and owned by the
+  // caller -- confirm against the definition.
+  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
+
+  // ResultIterator is copy constructible!
+  // The default copy constructor works just fine for us.
+  virtual ~ResultIterator() {}
+
+  // ============= Moving around within the page ============.
+  // Moves the iterator to point to the start of the page to begin an iteration.
+  virtual void Begin();
+
+  // Moves to the start of the next object at the given level in the
+  // page hierarchy in the appropriate reading order and returns false if
+  // the end of the page was reached.
+  // NOTE that RIL_SYMBOL will skip non-text blocks, but all other
+  // PageIteratorLevel level values will visit each non-text block once.
+  // Think of non text blocks as containing a single para, with a single line,
+  // with a single imaginary word.
+  // Calls to Next with different levels may be freely intermixed.
+  // This function iterates words in right-to-left scripts correctly, if
+  // the appropriate language has been loaded into Tesseract.
+  virtual bool Next(PageIteratorLevel level);
+
+  // IsAtBeginningOf() returns whether we're at the logical (reading order)
+  // beginning of the given level, as opposed to its beginning in the strict
+  // left-to-right, top-to-bottom order used by the base-class iterators.
+  // Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
+  // For a full description, see pageiterator.h
+  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
+
+  // Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
+  // For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
+  // point at the last word in a paragraph.  See PageIterator for full comment.
+  virtual bool IsAtFinalElement(PageIteratorLevel level,
+                                PageIteratorLevel element) const;
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // object at the given level. Use delete [] to free after use.
+  virtual char* GetUTF8Text(PageIteratorLevel level) const;
+
+  // Return whether the current paragraph's dominant reading direction
+  // is left-to-right (as opposed to right-to-left).
+  bool ParagraphIsLtr() const;
+
+  // ============= Exposed only for testing =============.
+
+  // Yields the reading order as a sequence of indices and (optional)
+  // meta-marks for a set of words (given left-to-right).
+  // The meta marks are passed as negative values:
+  //   kMinorRunStart  Start of minor direction text.
+  //   kMinorRunEnd    End of minor direction text.
+  //   kComplexWord    The next indexed word contains both left-to-right and
+  //                    right-to-left characters and was treated as neutral.
+  //
+  // For example, suppose we have five words in a text line,
+  // indexed [0,1,2,3,4] from the leftmost side of the text line.
+  // The following are all believable reading_orders:
+  //
+  // Left-to-Right (in ltr paragraph):
+  //     { 0, 1, 2, 3, 4 }
+  // Left-to-Right (in rtl paragraph):
+  //     { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
+  // Right-to-Left (in rtl paragraph):
+  //     { 4, 3, 2, 1, 0 }
+  // Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
+  //     { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
+  static void CalculateTextlineOrder(
+      bool paragraph_is_ltr,
+      const GenericVector<StrongScriptDirection> &word_dirs,
+      GenericVectorEqEq<int> *reading_order);
+
+  // Meta-mark values that may appear in a reading order next to the word
+  // indices (see CalculateTextlineOrder above).  They are only declared
+  // here; their values are defined out of line.
+  static const int kMinorRunStart;
+  static const int kMinorRunEnd;
+  static const int kComplexWord;
+
+ protected:
+  // We presume the data associated with the given iterator will outlive us.
+  // NB: This is protected because it does something that is non-obvious:
+  //   it resets to the beginning of the paragraph instead of staying wherever
+  //   resit might have pointed.
+  explicit ResultIterator(const LTRResultIterator &resit);
+
+ private:
+  // Calculates the current paragraph's dominant writing direction.
+  // Typically, members should use current_paragraph_is_ltr_ instead.
+  bool CurrentParagraphIsLtr() const;
+
+  // Returns word indices as measured from resit->RestartRow() = index 0
+  // for the reading order of words within a textline given an iterator
+  // into the middle of the text line.
+  // In addition to non-negative word indices, the following negative values
+  // may be inserted:
+  //   kMinorRunStart  Start of minor direction text.
+  //   kMinorRunEnd    End of minor direction text.
+  //   kComplexWord    The previous word contains both left-to-right and
+  //                   right-to-left characters and was treated as neutral.
+  // NOTE(review): the public overload describes kComplexWord as marking the
+  // *next* indexed word, while this comment says *previous* -- confirm which
+  // one matches the implementation.
+  void CalculateTextlineOrder(bool paragraph_is_ltr,
+                              const LTRResultIterator &resit,
+                              GenericVectorEqEq<int> *indices) const;
+  // Same as above, but the caller's ssd gets filled in if ssd != NULL.
+  void CalculateTextlineOrder(bool paragraph_is_ltr,
+                              const LTRResultIterator &resit,
+                              GenericVector<StrongScriptDirection> *ssd,
+                              GenericVectorEqEq<int> *indices) const;
+
+  // What is the index of the current word in a strict left-to-right reading
+  // of the row?
+  int LTRWordIndex() const;
+
+  // Given an iterator pointing at a word, returns the logical reading order
+  // of blob indices for the word.
+  void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
+
+  // Precondition: current_paragraph_is_ltr_ is set.
+  void MoveToLogicalStartOfTextline();
+
+  // Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set.
+  void MoveToLogicalStartOfWord();
+
+  // Are we pointing at the final (reading order) symbol of the word?
+  bool IsAtFinalSymbolOfWord() const;
+
+  // Are we pointing at the first (reading order) symbol of the word?
+  bool IsAtFirstSymbolOfWord() const;
+
+  // Append any extra marks that should be appended to this word when printed.
+  // Mostly, these are Unicode BiDi control characters.
+  void AppendSuffixMarks(STRING *text) const;
+
+  // Appends the current word in reading order to the given buffer.
+  void AppendUTF8WordText(STRING *text) const;
+
+  // Appends the text of the current text line, *assuming this iterator is
+  // positioned at the beginning of the text line*.  This function
+  // updates the iterator to point to the first position past the text line.
+  // Each textline is terminated in a single newline character.
+  // If the textline ends a paragraph, it gets a second terminal newline.
+  void IterateAndAppendUTF8TextlineText(STRING *text);
+
+  // Appends the text of the current paragraph in reading order
+  // to the given buffer.
+  // Each textline is terminated in a single newline character, and the
+  // paragraph gets an extra newline at the end.
+  void AppendUTF8ParagraphText(STRING *text) const;
+
+  // Returns whether the bidi_debug flag is set to at least min_level.
+  bool BidiDebug(int min_level) const;
+
+  // Is the current paragraph's dominant direction left-to-right?
+  // NOTE(review): presumably the cached result of CurrentParagraphIsLtr();
+  // confirm where it is refreshed.
+  bool current_paragraph_is_ltr_;
+
+  // Is the currently pointed-at character at the beginning of
+  // a minor-direction run?
+  bool at_beginning_of_minor_run_;
+
+  // Is the currently pointed-at character in a minor-direction sequence?
+  bool in_minor_direction_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_CCMAIN_RESULT_ITERATOR_H__