mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-23 19:13:00 +08:00
0e868ef377
Tha, Vie, Kan, Tel etc. There is a new overlap detector that detects when diacritics cause a big increase in textline overlap. In such cases, diacritics from overlap regions are kept separate from layout analysis completely, allowing textline formation to happen without them. The diacritics are then assigned to 0, 1 or 2 close words at the end of layout analysis, using and modifying an old noise detection data path. The stored diacritics are used or not during recognition according to the character classifier's liking for them.
365 lines
15 KiB
C++
365 lines
15 KiB
C++
///////////////////////////////////////////////////////////////////////
|
|
// File: pageiterator.h
|
|
// Description: Iterator for tesseract page structure that avoids using
|
|
// tesseract internal data structures.
|
|
// Author: Ray Smith
|
|
// Created: Fri Feb 26 11:01:06 PST 2010
|
|
//
|
|
// (C) Copyright 2010, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
|
|
#define TESSERACT_CCMAIN_PAGEITERATOR_H__
|
|
|
|
#include "publictypes.h"
|
|
#include "platform.h"
|
|
|
|
struct BlamerBundle;
|
|
class C_BLOB_IT;
|
|
class PAGE_RES;
|
|
class PAGE_RES_IT;
|
|
class WERD;
|
|
struct Pix;
|
|
struct Pta;
|
|
|
|
namespace tesseract {
|
|
|
|
class Tesseract;
|
|
|
|
/**
|
|
* Class to iterate over tesseract page structure, providing access to all
|
|
* levels of the page hierarchy, without including any tesseract headers or
|
|
* having to handle any tesseract structures.
|
|
* WARNING! This class points to data held within the TessBaseAPI class, and
|
|
* therefore can only be used while the TessBaseAPI class still exists and
|
|
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
|
|
* DetectOS, or anything else that changes the internal PAGE_RES.
|
|
* See apitypes.h for the definition of PageIteratorLevel.
|
|
* See also ResultIterator, derived from PageIterator, which adds in the
|
|
* ability to access OCR output with text-specific methods.
|
|
*/
|
|
|
|
class TESS_API PageIterator {
|
|
public:
|
|
/**
|
|
* page_res and tesseract come directly from the BaseAPI.
|
|
* The rectangle parameters are copied indirectly from the Thresholder,
|
|
* via the BaseAPI. They represent the coordinates of some rectangle in an
|
|
* original image (in top-left-origin coordinates) and therefore the top-left
|
|
* needs to be added to any output boxes in order to specify coordinates
|
|
* in the original image. See TessBaseAPI::SetRectangle.
|
|
* The scale and scaled_yres are in case the Thresholder scaled the image
|
|
* rectangle prior to thresholding. Any coordinates in tesseract's image
|
|
* must be divided by scale before adding (rect_left, rect_top).
|
|
* The scaled_yres indicates the effective resolution of the binary image
|
|
* that tesseract has been given by the Thresholder.
|
|
* After the constructor, Begin has already been called.
|
|
*/
|
|
PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
|
|
int scale, int scaled_yres,
|
|
int rect_left, int rect_top,
|
|
int rect_width, int rect_height);
|
|
virtual ~PageIterator();
|
|
|
|
/**
|
|
* Page/ResultIterators may be copied! This makes it possible to iterate over
|
|
* all the objects at a lower level, while maintaining an iterator to
|
|
* objects at a higher level. These constructors DO NOT CALL Begin, so
|
|
* iterations will continue from the location of src.
|
|
*/
|
|
PageIterator(const PageIterator& src);
|
|
const PageIterator& operator=(const PageIterator& src);
|
|
|
|
/** Are we positioned at the same location as other? */
|
|
bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
|
|
|
|
// ============= Moving around within the page ============.
|
|
|
|
/**
|
|
* Moves the iterator to point to the start of the page to begin an
|
|
* iteration.
|
|
*/
|
|
virtual void Begin();
|
|
|
|
/**
|
|
* Moves the iterator to the beginning of the paragraph.
|
|
* This class implements this functionality by moving it to the zero indexed
|
|
* blob of the first (leftmost) word on the first row of the paragraph.
|
|
*/
|
|
virtual void RestartParagraph();
|
|
|
|
/**
|
|
* Return whether this iterator points anywhere in the first textline of a
|
|
* paragraph.
|
|
*/
|
|
bool IsWithinFirstTextlineOfParagraph() const;
|
|
|
|
/**
|
|
* Moves the iterator to the beginning of the text line.
|
|
* This class implements this functionality by moving it to the zero indexed
|
|
* blob of the first (leftmost) word of the row.
|
|
*/
|
|
virtual void RestartRow();
|
|
|
|
/**
|
|
* Moves to the start of the next object at the given level in the
|
|
* page hierarchy, and returns false if the end of the page was reached.
|
|
* NOTE that RIL_SYMBOL will skip non-text blocks, but all other
|
|
* PageIteratorLevel level values will visit each non-text block once.
|
|
* Think of non text blocks as containing a single para, with a single line,
|
|
* with a single imaginary word.
|
|
* Calls to Next with different levels may be freely intermixed.
|
|
* This function iterates words in right-to-left scripts correctly, if
|
|
* the appropriate language has been loaded into Tesseract.
|
|
*/
|
|
virtual bool Next(PageIteratorLevel level);
|
|
|
|
/**
|
|
* Returns true if the iterator is at the start of an object at the given
|
|
* level.
|
|
*
|
|
* For instance, suppose an iterator it is pointed to the first symbol of the
|
|
* first word of the third line of the second paragraph of the first block in
|
|
* a page, then:
|
|
* it.IsAtBeginningOf(RIL_BLOCK) = false
|
|
* it.IsAtBeginningOf(RIL_PARA) = false
|
|
* it.IsAtBeginningOf(RIL_TEXTLINE) = true
|
|
* it.IsAtBeginningOf(RIL_WORD) = true
|
|
* it.IsAtBeginningOf(RIL_SYMBOL) = true
|
|
*/
|
|
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
|
|
|
|
/**
|
|
* Returns whether the iterator is positioned at the last element in a
|
|
* given level. (e.g. the last word in a line, the last line in a block)
|
|
*
|
|
* Here's some two-paragraph example
|
|
* text. It starts off innocuously
|
|
* enough but quickly turns bizarre.
|
|
* The author inserts a cornucopia
|
|
* of words to guard against confused
|
|
* references.
|
|
*
|
|
* Now take an iterator it pointed to the start of "bizarre."
|
|
* it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
|
|
* it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
|
|
* it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
|
|
*/
|
|
virtual bool IsAtFinalElement(PageIteratorLevel level,
|
|
PageIteratorLevel element) const;
|
|
|
|
/**
|
|
* Returns whether this iterator is positioned
|
|
* before other: -1
|
|
* equal to other: 0
|
|
* after other: 1
|
|
*/
|
|
int Cmp(const PageIterator &other) const;
|
|
|
|
// ============= Accessing data ==============.
|
|
// Coordinate system:
|
|
// Integer coordinates are at the cracks between the pixels.
|
|
// The top-left corner of the top-left pixel in the image is at (0,0).
|
|
// The bottom-right corner of the bottom-right pixel in the image is at
|
|
// (width, height).
|
|
// Every bounding box goes from the top-left of the top-left contained
|
|
// pixel to the bottom-right of the bottom-right contained pixel, so
|
|
// the bounding box of the single top-left pixel in the image is:
|
|
// (0,0)->(1,1).
|
|
// If an image rectangle has been set in the API, then returned coordinates
|
|
// relate to the original (full) image, rather than the rectangle.
|
|
|
|
/**
|
|
* Controls what to include in a bounding box. Bounding boxes of all levels
|
|
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
|
|
* Between layout analysis and recognition, it isn't known where all
|
|
* diacritics belong, so this control is used to include or exclude some
|
|
* diacritics that are above or below the main body of the word. In most cases
|
|
* where the placement is obvious, and after recognition, it doesn't make as
|
|
* much difference, as the diacritics will already be included in the word.
|
|
*/
|
|
void SetBoundingBoxComponents(bool include_upper_dots,
|
|
bool include_lower_dots) {
|
|
include_upper_dots_ = include_upper_dots;
|
|
include_lower_dots_ = include_lower_dots;
|
|
}
|
|
|
|
/**
|
|
* Returns the bounding rectangle of the current object at the given level.
|
|
* See comment on coordinate system above.
|
|
* Returns false if there is no such object at the current position.
|
|
* The returned bounding box is guaranteed to match the size and position
|
|
* of the image returned by GetBinaryImage, but may clip foreground pixels
|
|
* from a grey image. The padding argument to GetImage can be used to expand
|
|
* the image to include more foreground pixels. See GetImage below.
|
|
*/
|
|
bool BoundingBox(PageIteratorLevel level,
|
|
int* left, int* top, int* right, int* bottom) const;
|
|
bool BoundingBox(PageIteratorLevel level, const int padding,
|
|
int* left, int* top, int* right, int* bottom) const;
|
|
/**
|
|
* Returns the bounding rectangle of the object in a coordinate system of the
|
|
* working image rectangle having its origin at (rect_left_, rect_top_) with
|
|
* respect to the original image and is scaled by a factor scale_.
|
|
*/
|
|
bool BoundingBoxInternal(PageIteratorLevel level,
|
|
int* left, int* top, int* right, int* bottom) const;
|
|
|
|
/** Returns whether there is no object of a given level. */
|
|
bool Empty(PageIteratorLevel level) const;
|
|
|
|
/**
|
|
* Returns the type of the current block. See apitypes.h for
|
|
* PolyBlockType.
|
|
*/
|
|
PolyBlockType BlockType() const;
|
|
|
|
/**
|
|
* Returns the polygon outline of the current block. The returned Pta must
|
|
* be ptaDestroy-ed after use. Note that the returned Pta lists the vertices
|
|
* of the polygon, and the last edge is the line segment between the last
|
|
* point and the first point. NULL will be returned if the iterator is
|
|
* at the end of the document or layout analysis was not used.
|
|
*/
|
|
Pta* BlockPolygon() const;
|
|
|
|
/**
|
|
* Returns a binary image of the current object at the given level.
|
|
* The position and size match the return from BoundingBoxInternal, and so
|
|
* this could be upscaled with respect to the original input image.
|
|
* Use pixDestroy to delete the image after use.
|
|
*/
|
|
Pix* GetBinaryImage(PageIteratorLevel level) const;
|
|
|
|
/**
|
|
* Returns an image of the current object at the given level in greyscale
|
|
* if available in the input. To guarantee a binary image use BinaryImage.
|
|
* NOTE that in order to give the best possible image, the bounds are
|
|
* expanded slightly over the binary connected component, by the supplied
|
|
* padding, so the top-left position of the returned image is returned
|
|
* in (left,top). These will most likely not match the coordinates
|
|
* returned by BoundingBox.
|
|
* If you do not supply an original image, you will get a binary one.
|
|
* Use pixDestroy to delete the image after use.
|
|
*/
|
|
Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
|
|
int* left, int* top) const;
|
|
|
|
/**
|
|
* Returns the baseline of the current object at the given level.
|
|
* The baseline is the line that passes through (x1, y1) and (x2, y2).
|
|
* WARNING: with vertical text, baselines may be vertical!
|
|
* Returns false if there is no baseline at the current position.
|
|
*/
|
|
bool Baseline(PageIteratorLevel level,
|
|
int* x1, int* y1, int* x2, int* y2) const;
|
|
|
|
/**
|
|
* Returns orientation for the block the iterator points to.
|
|
* orientation, writing_direction, textline_order: see publictypes.h
|
|
* deskew_angle: after rotating the block so the text orientation is
|
|
* upright, how many radians does one have to rotate the
|
|
* block anti-clockwise for it to be level?
|
|
* -Pi/4 <= deskew_angle <= Pi/4
|
|
*/
|
|
void Orientation(tesseract::Orientation *orientation,
|
|
tesseract::WritingDirection *writing_direction,
|
|
tesseract::TextlineOrder *textline_order,
|
|
float *deskew_angle) const;
|
|
|
|
/**
|
|
* Returns information about the current paragraph, if available.
|
|
*
|
|
* justification -
|
|
* LEFT if ragged right, or fully justified and script is left-to-right.
|
|
* RIGHT if ragged left, or fully justified and script is right-to-left.
|
|
* unknown if it looks like source code or we have very few lines.
|
|
* is_list_item -
|
|
* true if we believe this is a member of an ordered or unordered list.
|
|
* is_crown -
|
|
* true if the first line of the paragraph is aligned with the other
|
|
* lines of the paragraph even though subsequent paragraphs have first
|
|
* line indents. This typically indicates that this is the continuation
|
|
* of a previous paragraph or that it is the very first paragraph in
|
|
* the chapter.
|
|
* first_line_indent -
|
|
* For LEFT aligned paragraphs, the first text line of paragraphs of
|
|
* this kind are indented this many pixels from the left edge of the
|
|
* rest of the paragraph.
|
|
* for RIGHT aligned paragraphs, the first text line of paragraphs of
|
|
* this kind are indented this many pixels from the right edge of the
|
|
* rest of the paragraph.
|
|
* NOTE 1: This value may be negative.
|
|
* NOTE 2: if *is_crown == true, the first line of this paragraph is
|
|
* actually flush, and first_line_indent is set to the "common"
|
|
* first_line_indent for subsequent paragraphs in this block
|
|
* of text.
|
|
*/
|
|
void ParagraphInfo(tesseract::ParagraphJustification *justification,
|
|
bool *is_list_item,
|
|
bool *is_crown,
|
|
int *first_line_indent) const;
|
|
|
|
// If the current WERD_RES (it_->word()) is not NULL, sets the BlamerBundle
|
|
// of the current word to the given pointer (takes ownership of the pointer)
|
|
// and returns true.
|
|
// Can only be used when iterating on the word level.
|
|
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
|
|
|
|
protected:
|
|
/**
|
|
* Sets up the internal data for iterating the blobs of a new word, then
|
|
* moves the iterator to the given offset.
|
|
*/
|
|
TESS_LOCAL void BeginWord(int offset);
|
|
|
|
/** Pointer to the page_res owned by the API. */
|
|
PAGE_RES* page_res_;
|
|
/** Pointer to the Tesseract object owned by the API. */
|
|
Tesseract* tesseract_;
|
|
/**
|
|
* The iterator to the page_res_. Owned by this ResultIterator.
|
|
* A pointer just to avoid dragging in Tesseract includes.
|
|
*/
|
|
PAGE_RES_IT* it_;
|
|
/**
|
|
* The current input WERD being iterated. If there is an output from OCR,
|
|
* then word_ is NULL. Owned by the API
|
|
*/
|
|
WERD* word_;
|
|
/** The length of the current word_. */
|
|
int word_length_;
|
|
/** The current blob index within the word. */
|
|
int blob_index_;
|
|
/**
|
|
* Iterator to the blobs within the word. If NULL, then we are iterating
|
|
* OCR results in the box_word.
|
|
* Owned by this ResultIterator.
|
|
*/
|
|
C_BLOB_IT* cblob_it_;
|
|
/** Control over what to include in bounding boxes. */
|
|
bool include_upper_dots_;
|
|
bool include_lower_dots_;
|
|
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
|
|
int scale_;
|
|
int scaled_yres_;
|
|
int rect_left_;
|
|
int rect_top_;
|
|
int rect_width_;
|
|
int rect_height_;
|
|
};
|
|
|
|
} // namespace tesseract.
|
|
|
|
#endif // TESSERACT_CCMAIN_PAGEITERATOR_H__
|