/////////////////////////////////////////////////////////////////////// // File: pageiterator.h // Description: Iterator for tesseract page structure that avoids using // tesseract internal data structures. // Author: Ray Smith // Created: Fri Feb 26 11:01:06 PST 2010 // // (C) Copyright 2010, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_API_PAGEITERATOR_H__ #define TESSERACT_API_PAGEITERATOR_H__ #include "apitypes.h" class C_BLOB_IT; class PBLOB_IT; class PAGE_RES; class PAGE_RES_IT; class WERD; struct Pix; namespace tesseract { class Tesseract; // Class to iterate over tesseract page structure, providing access to all // levels of the page hierarchy, without including any tesseract headers or // having to handle any tesseract structures. // WARNING! This class points to data held within the TessBaseAPI class, and // therefore can only be used while the TessBaseAPI class still exists and // has not been subjected to a call of Init, SetImage, Recognize, Clear, End // DetectOS, or anything else that changes the internal PAGE_RES. // See apitypes.h for the definition of PageIteratorLevel. // See also ResultIterator, derived from PageIterator, which adds in the // ability to access OCR output with text-specific methods. class PageIterator { public: // page_res and tesseract come directly from the BaseAPI. // The rectangle parameters are copied indirectly from the Thresholder, // via the BaseAPI. They represent the coordinates of some rectangle in an // original image (in top-left-origin coordinates) and therefore the top-left // needs to be added to any output boxes in order to specify coordinates // in the original image. See TessBaseAPI::SetRectangle. // The scale and scaled_yres are in case the Thresholder scaled the image // rectangle prior to thresholding. Any coordinates in tesseract's image // must be divided by scale before adding (rect_left, rect_top). // The scaled_yres indicates the effective resolution of the binary image // that tesseract has been given by the Thresholder. // After the constructor, Begin has already been called. PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height); virtual ~PageIterator(); // Page/ResultIterators may be copied! This makes it possible to iterate over // all the objects at a lower level, while maintaining an iterator to // objects at a higher level. These constructors DO NOT CALL Begin, so // iterations will continue from the location of src. PageIterator(const PageIterator& src); const PageIterator& operator=(const PageIterator& src); // ============= Moving around within the page ============. // Moves the iterator to point to the start of the page to begin an iteration. void Begin(); // Moves to the start of the next object at the given level in the // page hierarchy, and returns false if the end of the page was reached. // NOTE that RIL_SYMBOL will skip non-text blocks, but all other // PageIteratorLevel level values will visit each non-text block once. // Think of non text blocks as containing a single para, with a single line, // with a single imaginary word. // Calls to Next with different levels may be freely intermixed. // This function iterates words in right-to-left scripts correctly, if // the appropriate language has been loaded into Tesseract. bool Next(PageIteratorLevel level); // Returns true if the iterator is at the start of an object at the given // level. Possible uses include determining if a call to Next(RIL_WORD) // moved to the start of a RIL_PARA. bool IsAtBeginningOf(PageIteratorLevel level) const; // Returns whether the iterator is positioned at the last element in a // given level. (e.g. the last word in a line, the last line in a block) bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const; // ============= Accessing data ==============. // Coordinate system: // Integer coordinates are at the cracks between the pixels. // The top-left corner of the top-left pixel in the image is at (0,0). // The bottom-right corner of the bottom-right pixel in the image is at // (width, height). // Every bounding box goes from the top-left of the top-left contained // pixel to the bottom-right of the bottom-right contained pixel, so // the bounding box of the single top-left pixel in the image is: // (0,0)->(1,1). // If an image rectangle has been set in the API, then returned coordinates // relate to the original (full) image, rather than the rectangle. // Returns the bounding rectangle of the current object at the given level. // See comment on coordinate system above. // Returns false if there is no such object at the current position. // The returned bounding box is guaranteed to match the size and position // of the image returned by GetBinaryImage, but may clip foreground pixels // from a grey image. The padding argument to GetImage can be used to expand // the image to include more foreground pixels. See GetImage below. bool BoundingBox(PageIteratorLevel level, int* left, int* top, int* right, int* bottom) const; // Returns the type of the current block. See apitypes.h for PolyBlockType. PolyBlockType BlockType() const; // Returns a binary image of the current object at the given level. // The position and size match the return from BoundingBox. // Use pixDestroy to delete the image after use. Pix* GetBinaryImage(PageIteratorLevel level) const; // Returns an image of the current object at the given level in greyscale // if available in the input. To guarantee a binary image use BinaryImage. // NOTE that in order to give the best possible image, the bounds are // expanded slightly over the binary connected component, by the supplied // padding, so the top-left position of the returned image is returned // in (left,top). These will most likely not match the coordinates // returned by BoundingBox. // Use pixDestroy to delete the image after use. Pix* GetImage(PageIteratorLevel level, int padding, int* left, int* top) const; // Returns the baseline of the current object at the given level. // The baseline is the line that passes through (x1, y1) and (x2, y2). // WARNING: with vertical text, baselines may be vertical! // Returns false if there is no baseline at the current position. bool Baseline(PageIteratorLevel level, int* x1, int* y1, int* x2, int* y2) const; // Returns orientation for the block the iterator points to. // orientation, writing_direction, textline_order: see publictypes.h // deskew_angle: after rotating the block so the text orientation is // upright, how many radians does one have to rotate the // block anti-clockwise for it to be level? // -Pi/4 <= deskew_angle <= Pi/4 void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle); protected: // Sets up the internal data for iterating the blobs of a new word, then // moves the iterator to the given offset. void BeginWord(int offset); // Pointer to the page_res owned by the API. PAGE_RES* page_res_; // Pointer to the Tesseract object owned by the API. Tesseract* tesseract_; // The iterator to the page_res_. Owned by this ResultIterator. // A pointer just to avoid dragging in Tesseract includes. PAGE_RES_IT* it_; // The current input WERD being iterated. If there is an output from OCR, // then word_ is NULL. Owned by the API. WERD* word_; // The length of the current word_. int word_length_; // The current blob index within the word. int blob_index_; // Iterator to the blobs within the word. If NULL, then we are iterating // OCR results in the box_word. // Owned by this ResultIterator. C_BLOB_IT* cblob_it_; // Parameters saved from the Thresholder. Needed to rebuild coordinates. int scale_; int scaled_yres_; int rect_left_; int rect_top_; int rect_width_; int rect_height_; }; } // namespace tesseract. #endif // TESSERACT_API_PAGEITERATOR_H__