tesseract/ccmain/paragraphs.h

/**********************************************************************
 * File:        paragraphs.h
 * Description: Paragraph Detection data structures.
 * Author:      David Eger
 * Created:     25 February 2011
 *
 * (C) Copyright 2011, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_H_

#include "rect.h"
#include "ocrpara.h"
#include "genericvector.h"
#include "strngs.h"


class WERD;
class UNICHARSET;

namespace tesseract {

class MutableIterator;

// RowInfo captures all the information needed about a single text line for
// the purposes of paragraph detection.  It is meant to be exceedingly
// light-weight so that we can easily test paragraph detection independent of
// the rest of Tesseract.
//
// Every member is public and the class declares no constructor, so a
// default-initialized RowInfo (e.g. "RowInfo r;") leaves its bool/int/float
// members with indeterminate values.  Whoever builds a RowInfo must fill in
// each field before it is read.
class RowInfo {
 public:
  // Constant data derived from Tesseract output.
  STRING text;        // the full UTF-8 text of the line.
  bool ltr;           // whether the majority of the text is left-to-right
                      // TODO(eger) make this more fine-grained.

  bool has_leaders;   // does the line contain leader dots (.....)?
  bool has_drop_cap;  // does the line have a drop cap?
  int pix_ldistance;  // distance to the left pblock boundary in pixels
  int pix_rdistance;  // distance to the right pblock boundary in pixels
  float pix_xheight;  // guessed xheight for the line
                      // NOTE(review): presumably in pixels, going by the
                      // pix_ prefix shared with the fields above -- confirm.
  int average_interword_space; // average space between words in pixels.

  int num_words;      // NOTE(review): presumably the number of words on the
                      // line -- confirm against the code that fills it in.
  TBOX lword_box;     // in normalized (horiz text rows) space
  TBOX rword_box;     // in normalized (horiz text rows) space

  STRING lword_text;   // the UTF-8 text of the leftmost word
  STRING rword_text;   // the UTF-8 text of the rightmost word

  // The text of a paragraph typically starts with the start of an idea and
  // ends with the end of an idea.  Here we define a paragraph as something
  // that may have a first line indent and a body indent, and the two may be
  // different.
  //
  // Typical words that start an idea are:
  //   1. Words in western scripts that start with a capital letter,
  //      for example "The"
  //   2. Bulleted or numbered list items, for example "2."
  // Typical words which end an idea are words ending in punctuation marks.
  // In this vocabulary, each list item is represented as a paragraph.
  //
  // The same three heuristic flags are kept for the leftmost word (lword_*)
  // and for the rightmost word (rword_*) of the line.
  bool lword_indicates_list_item;
  bool lword_likely_starts_idea;
  bool lword_likely_ends_idea;

  bool rword_indicates_list_item;
  bool rword_likely_starts_idea;
  bool rword_likely_ends_idea;
};

// Main entry point for the Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// split them into paragraphs.  See http://goto/paragraphstalk
// NOTE(review): that is a short link and may not resolve outside the
// original author's network.
//
// Input:
//   debug_level - NOTE(review): presumably controls how much diagnostic
//                 output is produced; confirm against the implementation.
//   row_infos - one RowInfo per textline.  It is passed as a non-const
//               pointer; NOTE(review): confirm whether the implementation
//               modifies it.
//
// Output:
//   row_owners - one pointer for each row, to the paragraph it belongs to.
//   paragraphs - this is the actual list of PARA objects.
//   models - the list of paragraph models referenced by the PARA objects.
//            caller is responsible for deleting the models.
void DetectParagraphs(int debug_level,
                      GenericVector<RowInfo> *row_infos,
                      GenericVector<PARA *> *row_owners,
                      PARA_LIST *paragraphs,
                      GenericVector<ParagraphModel *> *models);

// Given a MutableIterator to the start of a block, run DetectParagraphs on
// that block and commit the results to the underlying ROW and BLOCK structs,
// saving the ParagraphModels in models.  Caller owns the models.
//
// We use unicharset during the function to answer questions such as "is the
// first letter of this word upper case?"
// NOTE(review): no UNICHARSET is passed to this function; presumably it is
// reached through block_start -- confirm against the implementation.
//
// NOTE(review): after_text_recognition is not described here; presumably it
// tells the function whether recognized text is already available for the
// block -- confirm against the implementation.
void DetectParagraphs(int debug_level,
                      bool after_text_recognition,
                      const MutableIterator *block_start,
                      GenericVector<ParagraphModel *> *models);

}  // namespace

#endif  // TESSERACT_CCMAIN_PARAGRAPHS_H_
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`/**********************************************************************`
			`* File: paragraphs.h`
			`* Description: Paragraph Detection data structures.`
			`* Author: David Eger`
			`* Created: 25 February 2011`
			`*`
			`* (C) Copyright 2011, Google Inc.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`**********************************************************************/`

			`#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_`
			`#define TESSERACT_CCMAIN_PARAGRAPHS_H_`

			`#include "rect.h"`
			`#include "ocrpara.h"`
			`#include "genericvector.h"`
			`#include "strngs.h"`


			`class WERD;`
			`class UNICHARSET;`

			`namespace tesseract {`

			`class MutableIterator;`

			`// This structure captures all information needed about a text line for the`
			`// purposes of paragraph detection. It is meant to be exceedingly light-weight`
			`// so that we can easily test paragraph detection independent of the rest of`
			`// Tesseract.`
			`class RowInfo {`
			`public:`
			`// Constant data derived from Tesseract output.`
			`STRING text; // the full UTF-8 text of the line.`
			`bool ltr; // whether the majority of the text is left-to-right`
			`// TODO(eger) make this more fine-grained.`

			`bool has_leaders; // does the line contain leader dots (.....)?`
			`bool has_drop_cap; // does the line have a drop cap?`
			`int pix_ldistance; // distance to the left pblock boundary in pixels`
			`int pix_rdistance; // distance to the right pblock boundary in pixels`
			`float pix_xheight; // guessed xheight for the line`
			`int average_interword_space; // average space between words in pixels.`

			`int num_words;`
			`TBOX lword_box; // in normalized (horiz text rows) space`
			`TBOX rword_box; // in normalized (horiz text rows) space`

			`STRING lword_text; // the UTF-8 text of the leftmost werd`
			`STRING rword_text; // the UTF-8 text of the rightmost werd`

			`// The text of a paragraph typically starts with the start of an idea and`
			`// ends with the end of an idea. Here we define paragraph as something that`
			`// may have a first line indent and a body indent which may be different.`
			`// Typical words that start an idea are:`
			`// 1. Words in western scripts that start with`
			`// a capital letter, for example "The"`
			`// 2. Bulleted or numbered list items, for`
			`// example "2."`
			`// Typical words which end an idea are words ending in punctuation marks. In`
			`// this vocabulary, each list item is represented as a paragraph.`
			`bool lword_indicates_list_item;`
			`bool lword_likely_starts_idea;`
			`bool lword_likely_ends_idea;`

			`bool rword_indicates_list_item;`
			`bool rword_likely_starts_idea;`
			`bool rword_likely_ends_idea;`
			`};`

			`// Main entry point for Paragraph Detection Algorithm.`
			`//`
			`// Given a set of equally spaced textlines (described by row_infos),`
			`// Split them into paragraphs. See http://goto/paragraphstalk`
			`//`
			`// Output:`
			`// row_owners - one pointer for each row, to the paragraph it belongs to.`
			`// paragraphs - this is the actual list of PARA objects.`
			`// models - the list of paragraph models referenced by the PARA objects.`
			`// caller is responsible for deleting the models.`
			`void DetectParagraphs(int debug_level,`
			`GenericVector<RowInfo> *row_infos,`
			`GenericVector<PARA *> *row_owners,`
			`PARA_LIST *paragraphs,`
			`GenericVector<ParagraphModel *> *models);`

			`// Given a MutableIterator to the start of a block, run DetectParagraphs on`
			`// that block and commit the results to the underlying ROW and BLOCK structs,`
			`// saving the ParagraphModels in models. Caller owns the models.`
			`// We use unicharset during the function to answer questions such as "is the`
			`// first letter of this word upper case?"`
			`void DetectParagraphs(int debug_level,`
Provide better paragraph segmentation without having to run fully automatic layout analysis. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@725 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-05-10 08:03:34 +08:00			`bool after_text_recognition,`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Added paragraph detection in layout analysis/post OCR, Fixed inconsistent xheight during training and over-chopping, Added simultaneous multi-language capability, Refactored top-level word recognition module, Fixed problems with internally scaled images git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 10:59:49 +08:00			`const MutableIterator *block_start,`
			`GenericVector<ParagraphModel *> *models);`

			`} // namespace`

			`#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_`