tesseract/src/ccmain/paragraphs_internal.h

/**********************************************************************
 * File:        paragraphs_internal.h
 * Description: Paragraph Detection internal data structures.
 * Author:      David Eger
 * Created:     11 March 2011
 *
 * (C) Copyright 2011, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

#include "paragraphs.h"

// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.

class WERD_CHOICE;

namespace tesseract {

// Return whether the given word is likely to be a list item start word.
bool AsciiLikelyListItem(const STRING &word);

// Return the first Unicode Codepoint from werd[pos].
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);

// Set right word attributes given either a unicharset and werd or a utf8
// string.
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
                         const STRING &utf8,
                         bool *is_list, bool *starts_idea, bool *ends_idea);

// Set left word attributes given either a unicharset and werd or a utf8 string.
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
                        const STRING &utf8,
                        bool *is_list, bool *starts_idea, bool *ends_idea);

enum LineType {
  LT_START = 'S',     // First line of a paragraph.
  LT_BODY = 'C',      // Continuation line of a paragraph.
  LT_UNKNOWN = 'U',   // No clues.
  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
};

// The first paragraph in a page of body text is often un-indented.
// This is a typographic convention which is common to indicate either that:
// (1) The paragraph is the continuation of a previous paragraph, or
// (2) The paragraph is the first paragraph in a chapter.
//
// I refer to such paragraphs as "crown"s, and the output of the paragraph
// detection algorithm attempts to give them the same paragraph model as
// the rest of the body text.
//
// Nonetheless, while building hypotheses, it is useful to mark the lines
// of crown paragraphs temporarily as crowns, either aligned left or right.
extern const ParagraphModel *kCrownLeft;
extern const ParagraphModel *kCrownRight;

inline bool StrongModel(const ParagraphModel *model) {
  return model != nullptr && model != kCrownLeft && model != kCrownRight;
}

struct LineHypothesis {
  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
  LineHypothesis(LineType line_type, const ParagraphModel *m)
      : ty(line_type), model(m) {}
  LineHypothesis(const LineHypothesis &other)
      : ty(other.ty), model(other.model) {}

  bool operator==(const LineHypothesis &other) const {
    return ty == other.ty && model == other.model;
  }

  LineType ty;
  const ParagraphModel *model;
};

class ParagraphTheory;  // Forward Declaration

using SetOfModels = GenericVectorEqEq<const ParagraphModel *>;

// Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input.
class RowScratchRegisters {
 public:
  // We presume row will outlive us.
  void Init(const RowInfo &row);

  LineType GetLineType() const;

  LineType GetLineType(const ParagraphModel *model) const;

  // Mark this as a start line type, sans model.  This is useful for the
  // initial marking of probable body lines or paragraph start lines.
  void SetStartLine();

  // Mark this as a body line type, sans model.  This is useful for the
  // initial marking of probably body lines or paragraph start lines.
  void SetBodyLine();

  // Record that this row fits as a paragraph start line in the given model,
  void AddStartLine(const ParagraphModel *model);
  // Record that this row fits as a paragraph body line in the given model,
  void AddBodyLine(const ParagraphModel *model);

  // Clear all hypotheses about this line.
  void SetUnknown() { hypotheses_.truncate(0); }

  // Append all hypotheses of strong models that match this row as a start.
  void StartHypotheses(SetOfModels *models) const;

  // Append all hypotheses of strong models matching this row.
  void StrongHypotheses(SetOfModels *models) const;

  // Append all hypotheses for this row.
  void NonNullHypotheses(SetOfModels *models) const;

  // Discard any hypotheses whose model is not in the given list.
  void DiscardNonMatchingHypotheses(const SetOfModels &models);

  // If we have only one hypothesis and that is that this line is a paragraph
  // start line of a certain model, return that model.  Else return nullptr.
  const ParagraphModel *UniqueStartHypothesis() const;

  // If we have only one hypothesis and that is that this line is a paragraph
  // body line of a certain model, return that model.  Else return nullptr.
  const ParagraphModel *UniqueBodyHypothesis() const;

  // Return the indentation for the side opposite of the aligned side.
  int OffsideIndent(tesseract::ParagraphJustification just) const {
    switch (just) {
      case tesseract::JUSTIFICATION_RIGHT: return lindent_;
      case tesseract::JUSTIFICATION_LEFT: return rindent_;
      default: return lindent_ > rindent_ ? lindent_ : rindent_;
    }
  }

  // Return the indentation for the side the text is aligned to.
  int AlignsideIndent(tesseract::ParagraphJustification just) const {
    switch (just) {
      case tesseract::JUSTIFICATION_RIGHT: return rindent_;
      case tesseract::JUSTIFICATION_LEFT: return lindent_;
      default: return lindent_ > rindent_ ? lindent_ : rindent_;
    }
  }

  // Append header fields to a vector of row headings.
  static void AppendDebugHeaderFields(GenericVector<STRING> *header);

  // Append data for this row to a vector of debug strings.
  void AppendDebugInfo(const ParagraphTheory &theory,
                       GenericVector<STRING> *dbg) const;

  const RowInfo *ri_;

  // These four constants form a horizontal box model for the white space
  // on the edges of each line.  At each point in the algorithm, the following
  // shall hold:
  //   ri_->pix_ldistance = lmargin_ + lindent_
  //   ri_->pix_rdistance = rindent_ + rmargin_
  int lmargin_;
  int lindent_;
  int rindent_;
  int rmargin_;

 private:
  // Hypotheses of either LT_START or LT_BODY
  GenericVectorEqEq<LineHypothesis> hypotheses_;
};

// A collection of convenience functions for wrapping the set of
// Paragraph Models we believe correctly model the paragraphs in the image.
class ParagraphTheory {
 public:
  // We presume models will outlive us, and that models will take ownership
  // of any ParagraphModel *'s we add.
  explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
      : models_(models) {}
  GenericVector<ParagraphModel *> &models() { return *models_; }
  const GenericVector<ParagraphModel *> &models() const { return *models_; }

  // Return an existing model if one that is Comparable() can be found.
  // Else, allocate a new copy of model to save and return a pointer to it.
  const ParagraphModel *AddModel(const ParagraphModel &model);

  // Discard any models we've made that are not in the list of used models.
  void DiscardUnusedModels(const SetOfModels &used_models);

  // Return the set of all non-centered models.
  void NonCenteredModels(SetOfModels *models);

  // If any of the non-centered paragraph models we know about fit
  // rows[start, end), return it.  Else nullptr.
  const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
                             int start, int end) const;

  int IndexOf(const ParagraphModel *model) const;

 private:
  GenericVector<ParagraphModel *> *models_;
  GenericVectorEqEq<ParagraphModel *> models_we_added_;
};

bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
                    int row, const ParagraphModel *model);
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
                   int row, const ParagraphModel *model);
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
                     int a, int b, const ParagraphModel *model);

// A class for smearing Paragraph Model hypotheses to surrounding rows.
// The idea here is that StrongEvidenceClassify first marks only exceedingly
// obvious start and body rows and constructs models of them.  Thereafter,
// we may have left over unmarked lines (mostly end-of-paragraph lines) which
// were too short to have much confidence about, but which fit the models we've
// constructed perfectly and which we ought to mark.  This class is used to
// "smear" our models over the text.
class ParagraphModelSmearer {
 public:
  ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
                        int row_start, int row_end,
                        ParagraphTheory *theory);

  // Smear forward paragraph models from existing row markings to subsequent
  // text lines if they fit, and mark any thereafter still unmodeled rows
  // with any model in the theory that fits them.
  void Smear();

 private:
  // Record in open_models_ for rows [start_row, end_row) the list of models
  // currently open at each row.
  // A model is still open in a row if some previous row has said model as a
  // start hypothesis, and all rows since (including this row) would fit as
  // either a body or start line in that model.
  void CalculateOpenModels(int row_start, int row_end);

  SetOfModels &OpenModels(int row) {
    return open_models_[row - row_start_ + 1];
  }

  ParagraphTheory *theory_;
  GenericVector<RowScratchRegisters> *rows_;
  int row_start_;
  int row_end_;

  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
  //
  // open_models_:  Contains models which there was an active (open) paragraph
  //                as of the previous line and for which the left and right
  //                indents admit the possibility that this text line continues
  //                to fit the same model.
  // TODO(eger): Think about whether we can get rid of "Open" models and just
  //   use the current hypotheses on RowScratchRegisters.
  GenericVector<SetOfModels> open_models_;
};

// Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of
// rows.
void RecomputeMarginsAndClearHypotheses(
    GenericVector<RowScratchRegisters> *rows, int start, int end,
    int percentile);

// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
                   int row_start, int row_end);

// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
                           const RowScratchRegisters &after,
                           tesseract::ParagraphJustification justification);

// Return whether the first word on the after line can fit in the space at
// the end of the before line (not knowing the text alignment).
bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
                           const RowScratchRegisters &after);

// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
                  int start, int end, const ParagraphModel *model);

// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(
    GenericVector<PARA *> *row_owners,
    PARA_LIST *paragraphs);

}  // namespace
#endif  // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`/**********************************************************************`
fix filenames in comments 2017-07-03 05:35:47 +08:00			`* File: paragraphs_internal.h`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`* Description: Paragraph Detection internal data structures.`
			`* Author: David Eger`
			`* Created: 11 March 2011`
			`*`
			`* (C) Copyright 2011, Google Inc.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`**********************************************************************/`

			`#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_`
			`#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_`

			`#include "paragraphs.h"`

			`// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS`
			`// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.`

			`class WERD_CHOICE;`

			`namespace tesseract {`

			`// Return whether the given word is likely to be a list item start word.`
			`bool AsciiLikelyListItem(const STRING &word);`

			`// Return the first Unicode Codepoint from werd[pos].`
			`int UnicodeFor(const UNICHARSET u, const WERD_CHOICE werd, int pos);`

			`// Set right word attributes given either a unicharset and werd or a utf8`
			`// string.`
			`void RightWordAttributes(const UNICHARSET unicharset, const WERD_CHOICE werd,`
			`const STRING &utf8,`
			`bool is_list, bool starts_idea, bool *ends_idea);`

			`// Set left word attributes given either a unicharset and werd or a utf8 string.`
			`void LeftWordAttributes(const UNICHARSET unicharset, const WERD_CHOICE werd,`
			`const STRING &utf8,`
			`bool is_list, bool starts_idea, bool *ends_idea);`

			`enum LineType {`
			`LT_START = 'S', // First line of a paragraph.`
			`LT_BODY = 'C', // Continuation line of a paragraph.`
			`LT_UNKNOWN = 'U', // No clues.`
			`LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.`
			`};`

			`// The first paragraph in a page of body text is often un-indented.`
			`// This is a typographic convention which is common to indicate either that:`
			`// (1) The paragraph is the continuation of a previous paragraph, or`
			`// (2) The paragraph is the first paragraph in a chapter.`
			`//`
			`// I refer to such paragraphs as "crown"s, and the output of the paragraph`
			`// detection algorithm attempts to give them the same paragraph model as`
			`// the rest of the body text.`
			`//`
			`// Nonetheless, while building hypotheses, it is useful to mark the lines`
			`// of crown paragraphs temporarily as crowns, either aligned left or right.`
			`extern const ParagraphModel *kCrownLeft;`
			`extern const ParagraphModel *kCrownRight;`

			`inline bool StrongModel(const ParagraphModel *model) {`
ccmain: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 00:19:35 +08:00			`return model != nullptr && model != kCrownLeft && model != kCrownRight;`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`}`

			`struct LineHypothesis {`
ccmain: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 00:19:35 +08:00			`LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`LineHypothesis(LineType line_type, const ParagraphModel *m)`
			`: ty(line_type), model(m) {}`
			`LineHypothesis(const LineHypothesis &other)`
			`: ty(other.ty), model(other.model) {}`

			`bool operator==(const LineHypothesis &other) const {`
			`return ty == other.ty && model == other.model;`
			`}`

			`LineType ty;`
			`const ParagraphModel *model;`
			`};`

			`class ParagraphTheory; // Forward Declaration`

Use using instead of typedef. Reason: https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#Rt-using 2018-05-20 06:31:03 +08:00			`using SetOfModels = GenericVectorEqEq<const ParagraphModel *>;`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00
			`// Row Scratch Registers are data generated by the paragraph detection`
			`// algorithm based on a RowInfo input.`
			`class RowScratchRegisters {`
			`public:`
			`// We presume row will outlive us.`
			`void Init(const RowInfo &row);`

			`LineType GetLineType() const;`

			`LineType GetLineType(const ParagraphModel *model) const;`

			`// Mark this as a start line type, sans model. This is useful for the`
			`// initial marking of probable body lines or paragraph start lines.`
			`void SetStartLine();`

			`// Mark this as a body line type, sans model. This is useful for the`
			`// initial marking of probably body lines or paragraph start lines.`
			`void SetBodyLine();`

			`// Record that this row fits as a paragraph start line in the given model,`
			`void AddStartLine(const ParagraphModel *model);`
			`// Record that this row fits as a paragraph body line in the given model,`
			`void AddBodyLine(const ParagraphModel *model);`

			`// Clear all hypotheses about this line.`
			`void SetUnknown() { hypotheses_.truncate(0); }`

			`// Append all hypotheses of strong models that match this row as a start.`
			`void StartHypotheses(SetOfModels *models) const;`

			`// Append all hypotheses of strong models matching this row.`
			`void StrongHypotheses(SetOfModels *models) const;`

			`// Append all hypotheses for this row.`
			`void NonNullHypotheses(SetOfModels *models) const;`

			`// Discard any hypotheses whose model is not in the given list.`
			`void DiscardNonMatchingHypotheses(const SetOfModels &models);`

			`// If we have only one hypothesis and that is that this line is a paragraph`
ccmain: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 00:19:35 +08:00			`// start line of a certain model, return that model. Else return nullptr.`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`const ParagraphModel *UniqueStartHypothesis() const;`

			`// If we have only one hypothesis and that is that this line is a paragraph`
ccmain: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 00:19:35 +08:00			`// body line of a certain model, return that model. Else return nullptr.`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`const ParagraphModel *UniqueBodyHypothesis() const;`

			`// Return the indentation for the side opposite of the aligned side.`
			`int OffsideIndent(tesseract::ParagraphJustification just) const {`
			`switch (just) {`
			`case tesseract::JUSTIFICATION_RIGHT: return lindent_;`
			`case tesseract::JUSTIFICATION_LEFT: return rindent_;`
			`default: return lindent_ > rindent_ ? lindent_ : rindent_;`
			`}`
			`}`

			`// Return the indentation for the side the text is aligned to.`
			`int AlignsideIndent(tesseract::ParagraphJustification just) const {`
			`switch (just) {`
			`case tesseract::JUSTIFICATION_RIGHT: return rindent_;`
			`case tesseract::JUSTIFICATION_LEFT: return lindent_;`
			`default: return lindent_ > rindent_ ? lindent_ : rindent_;`
			`}`
			`}`

			`// Append header fields to a vector of row headings.`
			`static void AppendDebugHeaderFields(GenericVector<STRING> *header);`

			`// Append data for this row to a vector of debug strings.`
			`void AppendDebugInfo(const ParagraphTheory &theory,`
			`GenericVector<STRING> *dbg) const;`

			`const RowInfo *ri_;`

			`// These four constants form a horizontal box model for the white space`
			`// on the edges of each line. At each point in the algorithm, the following`
			`// shall hold:`
			`// ri_->pix_ldistance = lmargin_ + lindent_`
			`// ri_->pix_rdistance = rindent_ + rmargin_`
			`int lmargin_;`
			`int lindent_;`
			`int rindent_;`
			`int rmargin_;`

			`private:`
			`// Hypotheses of either LT_START or LT_BODY`
			`GenericVectorEqEq<LineHypothesis> hypotheses_;`
			`};`

			`// A collection of convenience functions for wrapping the set of`
			`// Paragraph Models we believe correctly model the paragraphs in the image.`
			`class ParagraphTheory {`
			`public:`
			`// We presume models will outlive us, and that models will take ownership`
			`// of any ParagraphModel *'s we add.`
			`explicit ParagraphTheory(GenericVector<ParagraphModel > models)`
			`: models_(models) {}`
			`GenericVector<ParagraphModel > &models() { return models_; }`
			`const GenericVector<ParagraphModel > &models() const { return models_; }`

			`// Return an existing model if one that is Comparable() can be found.`
			`// Else, allocate a new copy of model to save and return a pointer to it.`
			`const ParagraphModel *AddModel(const ParagraphModel &model);`

			`// Discard any models we've made that are not in the list of used models.`
			`void DiscardUnusedModels(const SetOfModels &used_models);`

			`// Return the set of all non-centered models.`
			`void NonCenteredModels(SetOfModels *models);`

			`// If any of the non-centered paragraph models we know about fit`
ccmain: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 00:19:35 +08:00			`// rows[start, end), return it. Else nullptr.`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`const ParagraphModel Fits(const GenericVector<RowScratchRegisters> rows,`
			`int start, int end) const;`

			`int IndexOf(const ParagraphModel *model) const;`

			`private:`
			`GenericVector<ParagraphModel > models_;`
			`GenericVectorEqEq<ParagraphModel *> models_we_added_;`
			`};`

			`bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,`
			`int row, const ParagraphModel *model);`
			`bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,`
			`int row, const ParagraphModel *model);`
			`bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,`
			`int a, int b, const ParagraphModel *model);`

			`// A class for smearing Paragraph Model hypotheses to surrounding rows.`
			`// The idea here is that StrongEvidenceClassify first marks only exceedingly`
			`// obvious start and body rows and constructs models of them. Thereafter,`
			`// we may have left over unmarked lines (mostly end-of-paragraph lines) which`
			`// were too short to have much confidence about, but which fit the models we've`
			`// constructed perfectly and which we ought to mark. This class is used to`
			`// "smear" our models over the text.`
			`class ParagraphModelSmearer {`
			`public:`
			`ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,`
			`int row_start, int row_end,`
			`ParagraphTheory *theory);`

			`// Smear forward paragraph models from existing row markings to subsequent`
			`// text lines if they fit, and mark any thereafter still unmodeled rows`
			`// with any model in the theory that fits them.`
			`void Smear();`

			`private:`
			`// Record in open_models_ for rows [start_row, end_row) the list of models`
			`// currently open at each row.`
			`// A model is still open in a row if some previous row has said model as a`
			`// start hypothesis, and all rows since (including this row) would fit as`
			`// either a body or start line in that model.`
			`void CalculateOpenModels(int row_start, int row_end);`

			`SetOfModels &OpenModels(int row) {`
			`return open_models_[row - row_start_ + 1];`
			`}`

			`ParagraphTheory *theory_;`
			`GenericVector<RowScratchRegisters> *rows_;`
			`int row_start_;`
			`int row_end_;`

			`// open_models_ corresponds to rows[start_row_ - 1, end_row_]`
			`//`
			`// open_models_: Contains models which there was an active (open) paragraph`
			`// as of the previous line and for which the left and right`
			`// indents admit the possibility that this text line continues`
			`// to fit the same model.`
			`// TODO(eger): Think about whether we can get rid of "Open" models and just`
			`// use the current hypotheses on RowScratchRegisters.`
			`GenericVector<SetOfModels> open_models_;`
			`};`

			`// Clear all hypotheses about lines [start, end) and reset the margins to the`
			`// percentile (0..100) value of the left and right row edges for this run of`
			`// rows.`
			`void RecomputeMarginsAndClearHypotheses(`
			`GenericVector<RowScratchRegisters> *rows, int start, int end,`
			`int percentile);`

Fixed typos and improved comments git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@753 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-09-21 23:31:20 +08:00			`// Return the median inter-word space in rows[row_start, row_end).`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,`
			`int row_start, int row_end);`

			`// Return whether the first word on the after line can fit in the space at`
			`// the end of the before line (knowing which way the text is aligned and read).`
			`bool FirstWordWouldHaveFit(const RowScratchRegisters &before,`
			`const RowScratchRegisters &after,`
			`tesseract::ParagraphJustification justification);`

			`// Return whether the first word on the after line can fit in the space at`
			`// the end of the before line (not knowing the text alignment).`
			`bool FirstWordWouldHaveFit(const RowScratchRegisters &before,`
			`const RowScratchRegisters &after);`

			`// Do rows[start, end) form a single instance of the given paragraph model?`
			`bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,`
			`int start, int end, const ParagraphModel *model);`

ccmain: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 00:19:35 +08:00			`// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`// normalize each row_owner to point to an actual PARA, and output the`
			`// paragraphs in order onto paragraphs.`
			`void CanonicalizeDetectionResults(`
			`GenericVector<PARA > row_owners,`
			`PARA_LIST *paragraphs);`

			`} // namespace`
			`#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_`