// tesseract/src/ccmain/resultiterator.h

///////////////////////////////////////////////////////////////////////
// File:        resultiterator.h
// Description: Iterator for tesseract results that is capable of
//              iterating in proper reading order over bidirectional
//              (e.g. mixed Hebrew and English) text.
// Author:      David Eger
// Created:     Fri May 27 13:58:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_

#include <set>                  // NOTE(review): no std::set use visible here; kept for dependents
#include <utility>              // for std::pair
#include <vector>               // for std::vector
#include "ltrresultiterator.h"  // for LTRResultIterator
#include "platform.h"           // for TESS_API, TESS_LOCAL
#include "publictypes.h"        // for PageIteratorLevel
#include "unichar.h"            // for StrongScriptDirection

// Forward declarations of the container templates. This header only uses
// them through pointers and references, so the full definitions are not
// required here.
template <typename T> class GenericVector;
template <typename T> class GenericVectorEqEq;

// Forward declaration; STRING is only used via pointer (STRING *) in this
// header.
class STRING;

namespace tesseract {

// Forward declaration.
// NOTE(review): no declaration visible in this header refers to Tesseract;
// confirm whether this is still needed.
class Tesseract;

/**
 * Iterator over tesseract results that visits elements in proper reading
 * order for bidirectional text (e.g. mixed Hebrew and English), built on top
 * of LTRResultIterator.
 *
 * Instances cannot be constructed directly by outside code (the constructor
 * is protected); use StartOfParagraph() to obtain one.
 */
class TESS_API ResultIterator : public LTRResultIterator {
 public:
  /**
   * Creates a ResultIterator from the given LTRResultIterator.
   * As noted on the constructor, the new iterator is reset to the beginning
   * of the paragraph rather than staying wherever resit pointed.
   * NOTE(review): a raw pointer is returned; presumably the caller owns it
   * and must delete it -- confirm against the implementation.
   */
  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);

  /**
   * ResultIterator is copy constructible!
   * The default copy constructor works just fine for us.
   * The destructor is likewise defaulted.
   */
  virtual ~ResultIterator() = default;

  // ============= Moving around within the page ============.
  /**
   * Moves the iterator to point to the start of the page to begin
   * an iteration.
   */
  virtual void Begin();

  /**
   * Moves to the start of the next object at the given level in the
   * page hierarchy in the appropriate reading order and returns false if
   * the end of the page was reached.
   * NOTE that RIL_SYMBOL will skip non-text blocks, but all other
   * PageIteratorLevel level values will visit each non-text block once.
   * Think of non text blocks as containing a single para, with a single line,
   * with a single imaginary word.
   * Calls to Next with different levels may be freely intermixed.
   * This function iterates words in right-to-left scripts correctly, if
   * the appropriate language has been loaded into Tesseract.
   */
  virtual bool Next(PageIteratorLevel level);

  /**
   * IsAtBeginningOf() returns whether we're at the logical beginning of the
   * given level, i.e. the beginning in reading order, as opposed to a strict
   * left-to-right top-to-bottom order.
   * Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
   * For a full description, see pageiterator.h
   * NOTE(review): the previous wording attributed the left-to-right
   * top-to-bottom order to ResultIterator; presumably PageIterator was meant.
   */
  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;

  /**
   * Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
   * For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
   * point at the last word in a paragraph.  See PageIterator for full comment.
   */
  virtual bool IsAtFinalElement(PageIteratorLevel level,
                                PageIteratorLevel element) const;

  // ============= Functions that refer to words only ============.
  // Returns the number of blanks before the current word.
  int BlanksBeforeWord() const;

  // ============= Accessing data ==============.

  /**
   * Returns the null terminated UTF-8 encoded text string for the current
   * object at the given level. Use delete [] to free after use.
   */
  virtual char* GetUTF8Text(PageIteratorLevel level) const;

  /**
   * Returns the LSTM choices for every LSTM timestep for the current word.
   * Each innermost pair is a (const char*, float) entry.
   * NOTE(review): presumably (symbol text, score) per choice, with one inner
   * vector per timestep -- confirm. Ownership of the returned pointer is not
   * visible from this header.
   */
  virtual std::vector<std::vector<std::pair<const char*, float>>>*
  GetRawLSTMTimesteps() const;
  /**
   * NOTE(review): undocumented in the original header. Judging by the name
   * and return type only, this presumably returns the best LSTM symbol
   * choices for the current word, using the same pair layout as
   * GetRawLSTMTimesteps() -- confirm against the implementation.
   */
  virtual std::vector<std::vector<std::pair<const char*, float>>>*
    GetBestLSTMSymbolChoices() const;
  /**
   * NOTE(review): undocumented in the original header. The return type adds
   * one level of nesting compared to GetRawLSTMTimesteps(); presumably the
   * timesteps are grouped into segments (e.g. per symbol) -- confirm against
   * the implementation.
   */
  virtual std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
    GetSegmentedLSTMTimesteps() const;

  /**
   * Return whether the current paragraph's dominant reading direction
   * is left-to-right (as opposed to right-to-left).
   */
  bool ParagraphIsLtr() const;

  // ============= Exposed only for testing =============.

  /**
   * Yields the reading order as a sequence of indices and (optional)
   * meta-marks for a set of words (given left-to-right).
   * The meta marks are passed as negative values:
   *   kMinorRunStart  Start of minor direction text.
   *   kMinorRunEnd    End of minor direction text.
   *   kComplexWord    The next indexed word contains both left-to-right and
   *                    right-to-left characters and was treated as neutral.
   *
   * NOTE(review): the private overloads below describe kComplexWord as
   * marking the *previous* word; one of the two descriptions is presumably
   * wrong -- confirm against the implementation.
   *
   * For example, suppose we have five words in a text line,
   * indexed [0,1,2,3,4] from the leftmost side of the text line.
   * The following are all believable reading_orders:
   *
   * Left-to-Right (in ltr paragraph):
   *     { 0, 1, 2, 3, 4 }
   * Left-to-Right (in rtl paragraph):
   *     { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
   * Right-to-Left (in rtl paragraph):
   *     { 4, 3, 2, 1, 0 }
   * Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
   *     { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
   *
   * @param paragraph_is_ltr  Whether the enclosing paragraph's dominant
   *                          direction is left-to-right.
   * @param word_dirs         Direction of each word, given left-to-right.
   * @param reading_order     Output: word indices and meta-marks in reading
   *                          order.
   */
  static void CalculateTextlineOrder(
      bool paragraph_is_ltr,
      const GenericVector<StrongScriptDirection> &word_dirs,
      GenericVectorEqEq<int> *reading_order);

  // Meta-marks that may appear in a reading order alongside word indices.
  // Per the CalculateTextlineOrder() documentation these are negative, so
  // they cannot be confused with (non-negative) word indices. Their values
  // are defined outside this header.
  static const int kMinorRunStart;
  static const int kMinorRunEnd;
  static const int kComplexWord;

 protected:
  /**
   * We presume the data associated with the given iterator will outlive us.
   * NB: This is protected (not public) because it does something that is
   *   non-obvious: it resets to the beginning of the paragraph instead of
   *   staying wherever resit might have pointed.
   */
  TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);

 private:
  /**
   * Calculates the current paragraph's dominant writing direction.
   * Typically, members should use current_paragraph_is_ltr_ instead.
   */
  bool CurrentParagraphIsLtr() const;

  /**
   * Returns word indices as measured from resit->RestartRow() = index 0
   * for the reading order of words within a textline given an iterator
   * into the middle of the text line.
   * In addition to non-negative word indices, the following negative values
   * may be inserted:
   *   kMinorRunStart  Start of minor direction text.
   *   kMinorRunEnd    End of minor direction text.
   *   kComplexWord    The previous word contains both left-to-right and
   *                   right-to-left characters and was treated as neutral.
   *
   * NOTE(review): the public static overload describes kComplexWord as
   * marking the *next* indexed word -- confirm which is correct.
   */
  void CalculateTextlineOrder(bool paragraph_is_ltr,
                              const LTRResultIterator &resit,
                              GenericVectorEqEq<int> *indices) const;
  /** Same as above, but the caller's ssd gets filled in if ssd != nullptr. */
  void CalculateTextlineOrder(bool paragraph_is_ltr,
                              const LTRResultIterator &resit,
                              GenericVector<StrongScriptDirection> *ssd,
                              GenericVectorEqEq<int> *indices) const;

  /**
   * What is the index of the current word in a strict left-to-right reading
   * of the row?
   */
  int LTRWordIndex() const;

  /**
   * Given an iterator pointing at a word, returns the logical reading order
   * of blob indices for the word.
   * The result is written to *blob_indices.
   */
  void CalculateBlobOrder(GenericVector<int> *blob_indices) const;

  /** Precondition: current_paragraph_is_ltr_ is set. */
  void MoveToLogicalStartOfTextline();

  /**
   * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
   * are set.
   */
  void MoveToLogicalStartOfWord();

  /** Are we pointing at the final (reading order) symbol of the word? */
  bool IsAtFinalSymbolOfWord() const;

  /** Are we pointing at the first (reading order) symbol of the word? */
  bool IsAtFirstSymbolOfWord() const;

  /**
   * Append any extra marks that should be appended to this word when printed.
   * Mostly, these are Unicode BiDi control characters.
   */
  void AppendSuffixMarks(STRING *text) const;

  /** Appends the current word in reading order to the given buffer. */
  void AppendUTF8WordText(STRING *text) const;

  /**
   * Appends the text of the current text line, *assuming this iterator is
   * positioned at the beginning of the text line*  This function
   * updates the iterator to point to the first position past the text line
   * (hence it is not const, unlike the other Append* helpers).
   * Each textline is terminated in a single newline character.
   * If the textline ends a paragraph, it gets a second terminal newline.
   */
  void IterateAndAppendUTF8TextlineText(STRING *text);

  /**
   * Appends the text of the current paragraph in reading order
   * to the given buffer.
   * Each textline is terminated in a single newline character, and the
   * paragraph gets an extra newline at the end.
   */
  void AppendUTF8ParagraphText(STRING *text) const;

  /** Returns whether the bidi_debug flag is set to at least min_level. */
  bool BidiDebug(int min_level) const;

  /**
   * Whether the current paragraph's dominant direction is left-to-right.
   * Several helpers above list this being set as a precondition; presumably
   * it caches the result of CurrentParagraphIsLtr() -- confirm.
   */
  bool current_paragraph_is_ltr_;

  /**
   * Is the currently pointed-at character at the beginning of
   * a minor-direction run?
   */
  bool at_beginning_of_minor_run_;

  /** Is the currently pointed-at character in a minor-direction sequence? */
  bool in_minor_direction_;

  /**
   * Should detected inter-word spaces be preserved, or "compressed" to a single
   * space character (default behavior).
   */
  bool preserve_interword_spaces_;
};

}  // namespace tesseract.

#endif  // TESSERACT_CCMAIN_RESULT_ITERATOR_H_