Moved ResultIterator/PageIterator to ccmain

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@645 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2012-02-02 02:47:59 +00:00
parent 8225f5b846
commit ef786ad29b
6 changed files with 1104 additions and 484 deletions

View File

@ -1,278 +0,0 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.cpp
// Description: Iterator for tesseract results that avoids using tesseract
// internal data structures
// Author: Ray Smith
// Created: Fri Feb 26 14:32:09 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "resultiterator.h"
#include "allheaders.h"
#include "pageres.h"
#include "tesseractclass.h"
namespace tesseract {
// Constructs a ResultIterator by forwarding every argument unchanged to the
// PageIterator base class. ResultIterator declares no data members of its
// own, so there is nothing further to initialize here.
ResultIterator::ResultIterator(PAGE_RES* page_res,
                               Tesseract* tesseract,
                               int scale,
                               int scaled_yres,
                               int rect_left,
                               int rect_top,
                               int rect_width,
                               int rect_height)
    : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left,
                   rect_top, rect_width, rect_height) {}
// ResultIterator declares no data members of its own, so this destructor
// has nothing to release.
ResultIterator::~ResultIterator() {}
// Builds the UTF-8 text of the object at the current position for the
// requested level and returns it as a freshly allocated, null terminated
// array. The caller owns the result and must release it with delete [].
// Returns NULL if the iterator is already at the end.
// RIL_BLOCK and RIL_PARA both collect every word of the current block;
// RIL_TEXTLINE collects every word of the current row. In both cases each
// word is followed by "\n" if it carries W_EOL, otherwise by " ".
char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == NULL) return NULL;  // Already at the end!
  STRING text;
  // Work on a copy so that the iterator itself does not move.
  PAGE_RES_IT cursor(*it_);
  WERD_CHOICE* first_choice = cursor.word()->best_choice;
  ASSERT_HOST(first_choice != NULL);

  const bool by_block = (level == RIL_BLOCK || level == RIL_PARA);
  if (by_block || level == RIL_TEXTLINE) {
    bool same_unit;
    do {
      WERD_CHOICE* choice = cursor.word()->best_choice;
      ASSERT_HOST(choice != NULL);
      text += choice->unichar_string();
      text += cursor.word()->word->flag(W_EOL) ? "\n" : " ";
      cursor.forward();
      // Keep going while the word just reached is still in the same
      // block (or row) as the one just consumed.
      same_unit = by_block ? cursor.block() == cursor.prev_block()
                           : cursor.row() == cursor.prev_row();
    } while (same_unit);
  } else if (level == RIL_WORD) {
    text = first_choice->unichar_string();
  } else if (level == RIL_SYMBOL) {
    text = tesseract_->unicharset.id_to_unichar(
        first_choice->unichar_id(blob_index_));
  }

  // Copy into a caller-owned buffer, including the terminating null.
  int length = text.length() + 1;
  char* result = new char[length];
  strncpy(result, text.string(), length);
  return result;
}
// Computes the mean certainty of the object at the current position for the
// requested level and maps it to a percentage via 100 + 5 * certainty,
// clipped to the range [0.0f, 100.0f].
// Returns 0.0f if the iterator is already at the end, or if the level
// contributed no certainty values.
float ResultIterator::Confidence(PageIteratorLevel level) const {
  if (it_->word() == NULL) return 0.0f;  // Already at the end!
  float certainty_sum = 0.0f;
  int samples = 0;
  // Work on a copy so that the iterator itself does not move.
  PAGE_RES_IT cursor(*it_);
  WERD_CHOICE* first_choice = cursor.word()->best_choice;
  ASSERT_HOST(first_choice != NULL);

  const bool by_block = (level == RIL_BLOCK || level == RIL_PARA);
  if (by_block || level == RIL_TEXTLINE) {
    bool same_unit;
    do {
      WERD_CHOICE* choice = cursor.word()->best_choice;
      ASSERT_HOST(choice != NULL);
      certainty_sum += choice->certainty();
      ++samples;
      cursor.forward();
      same_unit = by_block ? cursor.block() == cursor.prev_block()
                           : cursor.row() == cursor.prev_row();
    } while (same_unit);
  } else if (level == RIL_WORD) {
    certainty_sum += first_choice->certainty();
    ++samples;
  } else if (level == RIL_SYMBOL) {
    BLOB_CHOICE_LIST_CLIST* choices = first_choice->blob_choices();
    if (choices == NULL) {
      // No per-blob choices are available: use the word certainty instead.
      certainty_sum += first_choice->certainty();
    } else {
      // Step to the choice list that belongs to the current blob.
      BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
      for (int skipped = 0; skipped < blob_index_; ++skipped)
        blob_choices_it.forward();
      // Search that list for the choice whose unichar id matches the one in
      // the best choice. If the whole list is cycled without a match, the
      // iterator's current element after cycling is used.
      BLOB_CHOICE_IT choice_it(blob_choices_it.data());
      choice_it.mark_cycle_pt();
      while (!choice_it.cycled_list() &&
             !(choice_it.data()->unichar_id() ==
               first_choice->unichar_id(blob_index_))) {
        choice_it.forward();
      }
      certainty_sum += choice_it.data()->certainty();
    }
    ++samples;
  }

  if (samples <= 0) return 0.0f;
  certainty_sum /= samples;
  float confidence = 100 + 5 * certainty_sum;
  if (confidence < 0.0f) confidence = 0.0f;
  if (confidence > 100.0f) confidence = 100.0f;
  return confidence;
}
// Returns the font attributes of the current word. If iterating at a higher
// level object than words, eg textlines, then this will return the
// attributes of the first word in that textline.
// The actual return value is a string representing a font name. It points
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
// the iterator itself, ie rendered invalid by various members of
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
// Pointsize is returned in printers points (1/72 inch.)
// Returns NULL if the iterator is already at the end, or if the word has no
// font (its font id is negative). Every output parameter is always written:
// on a NULL return the flags are false, *pointsize is 0, and *font_id is -1
// at the end of the page or the word's (negative) font id otherwise.
const char* ResultIterator::WordFontAttributes(bool* is_bold,
                                               bool* is_italic,
                                               bool* is_underlined,
                                               bool* is_monospace,
                                               bool* is_serif,
                                               bool* is_smallcaps,
                                               int* pointsize,
                                               int* font_id) const {
  // Give every output a defined value before any early return, so callers
  // that ignore the NULL result never read uninitialized data.
  *is_bold = false;
  *is_italic = false;
  *is_underlined = false;  // TODO(rays) fix this!
  *is_monospace = false;
  *is_serif = false;
  *is_smallcaps = false;
  *pointsize = 0;
  *font_id = -1;

  if (it_->word() == NULL) return NULL;  // Already at the end!
  *font_id = it_->word()->fontinfo_id;
  if (*font_id < 0) return NULL;  // No font available.

  const UnicityTable<FontInfo> &font_table = tesseract_->get_fontinfo_table();
  FontInfo font_info = font_table.get(*font_id);
  *is_bold = font_info.is_bold();
  *is_italic = font_info.is_italic();
  *is_monospace = font_info.is_fixed_pitch();
  *is_serif = font_info.is_serif();
  *is_smallcaps = it_->word()->small_caps;
  // The font size is calculated from a multiple of the x-height
  // that came from the block.
  float row_height = it_->row()->row->x_height() *
      it_->block()->block->cell_over_xheight();
  // Convert from pixels to printers points. A non-positive resolution
  // cannot be converted, so the point size stays 0 in that case.
  *pointsize = scaled_yres_ > 0
      ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
      : 0;

  return font_info.name;
}
// Reports whether the best choice of the current word was produced by one of
// the dictionary permuters (system, frequent-word or user dawg).
// Returns false if the iterator is already at the end.
bool ResultIterator::WordIsFromDictionary() const {
  if (it_->word() == NULL) return false;  // Already at the end!
  const int permuter = it_->word()->best_choice->permuter();
  if (permuter == SYSTEM_DAWG_PERM) return true;
  if (permuter == FREQ_DAWG_PERM) return true;
  return permuter == USER_DAWG_PERM;
}
// Reports whether the best choice of the current word was produced by the
// number permuter. Returns false if the iterator is already at the end.
bool ResultIterator::WordIsNumeric() const {
  if (it_->word() == NULL) {
    return false;  // Already at the end!
  }
  const int permuter = it_->word()->best_choice->permuter();
  return NUMBER_PERM == permuter;
}
// Reports whether the current symbol is a superscript.
// When iterating at a level above symbols (eg words), the answer refers to
// the first symbol of that word.
// Returns false when cblob_it_ is set or when there is no current word.
bool ResultIterator::SymbolIsSuperscript() const {
  if (cblob_it_ != NULL || it_->word() == NULL)
    return false;
  return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
}
// Reports whether the current symbol is a subscript.
// When iterating at a level above symbols (eg words), the answer refers to
// the first symbol of that word.
// Returns false when cblob_it_ is set or when there is no current word.
bool ResultIterator::SymbolIsSubscript() const {
  if (cblob_it_ != NULL || it_->word() == NULL)
    return false;
  return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
}
// Reports whether the current symbol is a dropcap.
// When iterating at a level above symbols (eg words), the answer refers to
// the first symbol of that word.
// Returns false when cblob_it_ is set or when there is no current word.
bool ResultIterator::SymbolIsDropcap() const {
  if (cblob_it_ != NULL || it_->word() == NULL)
    return false;
  return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
}
// Sets up iteration over the classifier choices of the symbol that result_it
// currently points to. result_it must be positioned on a word (asserted).
// If the word's best choice has no blob choices, choice_it_ stays NULL and
// the accessors of this class report "nothing available".
ChoiceIterator::ChoiceIterator(const ResultIterator& result_it) {
  ASSERT_HOST(result_it.it_->word() != NULL);
  tesseract_ = result_it.tesseract_;
  choice_it_ = NULL;

  PAGE_RES_IT word_it(*result_it.it_);
  BLOB_CHOICE_LIST_CLIST* blob_lists =
      word_it.word()->best_choice->blob_choices();
  if (blob_lists == NULL)
    return;

  // Advance to the choice list belonging to the symbol of interest.
  BLOB_CHOICE_LIST_C_IT list_it(blob_lists);
  for (int remaining = result_it.blob_index_; remaining > 0; --remaining)
    list_it.forward();

  // Heap-allocated iterator over that list; released in the destructor.
  choice_it_ = new BLOB_CHOICE_IT(list_it.data());
  choice_it_->mark_cycle_pt();
}
// Releases the choice iterator allocated by the constructor. choice_it_ may
// be NULL (no blob choices), in which case delete does nothing.
ChoiceIterator::~ChoiceIterator() {
  delete choice_it_;
}
// Advances to the next choice for the symbol. Returns false once the list of
// choices has been cycled, or if there were no choices to begin with.
bool ChoiceIterator::Next() {
  if (choice_it_ != NULL) {
    choice_it_->forward();
    return !choice_it_->cycled_list();
  }
  return false;
}
// Returns the null terminated UTF-8 encoded text string for the current
// choice.
// Returns NULL if there are no choices for this symbol, or if the unichar id
// of the current choice is negative, not below the unicharset size, or
// INVALID_UNICHAR_ID.
// NOTE: Unlike ResultIterator::GetUTF8Text, the return points to an
// internal structure and must NOT be delete[]ed by the caller.
const char* ChoiceIterator::GetUTF8Text() const {
  if (choice_it_ == NULL)
    return NULL;
  UNICHAR_ID id = choice_it_->data()->unichar_id();
  // Reject ids that cannot be looked up in the unicharset.
  if (id < 0 || id >= tesseract_->unicharset.size() ||
      id == INVALID_UNICHAR_ID)
    return NULL;
  return tesseract_->unicharset.id_to_unichar(id);
}
// Maps the certainty of the current choice to a percentage via
// 100 + 5 * certainty, clipped to the range [0.0f, 100.0f].
// Returns 0.0f if there are no choices for this symbol.
float ChoiceIterator::Confidence() const {
  if (choice_it_ == NULL) {
    return 0.0f;
  }
  float percent = 100 + 5 * choice_it_->data()->certainty();
  if (percent < 0.0f) {
    percent = 0.0f;
  }
  if (percent > 100.0f) {
    percent = 100.0f;
  }
  return percent;
}
} // namespace tesseract.

View File

@ -1,160 +0,0 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.h
// Description: Iterator for tesseract results that avoids using tesseract
// internal data structures.
// Author: Ray Smith
// Created: Fri Feb 26 11:01:06 PST 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_RESULTITERATOR_H__
#define TESSERACT_API_RESULTITERATOR_H__
#include "pageiterator.h"
class BLOB_CHOICE_IT;
namespace tesseract {
class Tesseract;
// Class to iterate over tesseract results, providing access to all levels
// of the page hierarchy, without including any tesseract headers or having
// to handle any tesseract structures.
// WARNING! This class points to data held within the TessBaseAPI class, and
// therefore can only be used while the TessBaseAPI class still exists and
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
// DetectOS, or anything else that changes the internal PAGE_RES.
// See apitypes.h for the definition of PageIteratorLevel.
// See also base class PageIterator, which contains the bulk of the interface.
// ResultIterator adds text-specific methods for access to OCR output.
class ResultIterator : public PageIterator {
// ChoiceIterator's constructor reads this iterator's position (it_,
// blob_index_) and its tesseract_ pointer directly.
friend class ChoiceIterator;
public:
// page_res and tesseract come directly from the BaseAPI.
// The rectangle parameters are copied indirectly from the Thresholder,
// via the BaseAPI. They represent the coordinates of some rectangle in an
// original image (in top-left-origin coordinates) and therefore the top-left
// needs to be added to any output boxes in order to specify coordinates
// in the original image. See TessBaseAPI::SetRectangle.
// The scale and scaled_yres are in case the Thresholder scaled the image
// rectangle prior to thresholding. Any coordinates in tesseract's image
// must be divided by scale before adding (rect_left, rect_top).
// The scaled_yres indicates the effective resolution of the binary image
// that tesseract has been given by the Thresholder.
// After the constructor, Begin has already been called.
ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
int rect_width, int rect_height);
virtual ~ResultIterator();
// ResultIterators may be copied! This makes it possible to iterate over
// all the objects at a lower level, while maintaining an iterator to
// objects at a higher level. These constructors DO NOT CALL Begin, so
// iterations will continue from the location of src.
// TODO: For now the copy constructor and operator= only need the base class
// versions, but if new data members are added, don't forget to add them!
// ============= Moving around within the page ============.
// See PageIterator.
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level. Use delete [] to free after use.
// Returns NULL if the iterator is already at the end.
char* GetUTF8Text(PageIteratorLevel level) const;
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
// Returns 0.0f if the iterator is already at the end.
float Confidence(PageIteratorLevel level) const;
// ============= Functions that refer to words only ============.
// Returns the font attributes of the current word. If iterating at a higher
// level object than words, eg textlines, then this will return the
// attributes of the first word in that textline.
// The actual return value is a string representing a font name. It points
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
// the iterator itself, ie rendered invalid by various members of
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
// Pointsize is returned in printers points (1/72 inch.)
// Returns NULL if the iterator is already at the end or if no font is
// available for the word.
const char* WordFontAttributes(bool* is_bold,
bool* is_italic,
bool* is_underlined,
bool* is_monospace,
bool* is_serif,
bool* is_smallcaps,
int* pointsize,
int* font_id) const;
// Returns true if the current word was found in a dictionary.
// Returns false if the iterator is already at the end.
bool WordIsFromDictionary() const;
// Returns true if the current word is numeric.
// Returns false if the iterator is already at the end.
bool WordIsNumeric() const;
// ============= Functions that refer to symbols only ============.
// Returns true if the current symbol is a superscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsSuperscript() const;
// Returns true if the current symbol is a subscript.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsSubscript() const;
// Returns true if the current symbol is a dropcap.
// If iterating at a higher level object than symbols, eg words, then
// this will return the attributes of the first symbol in that word.
bool SymbolIsDropcap() const;
};
// Class to iterate over the classifier choices for a single RIL_SYMBOL.
// NOTE(review): the constructor allocates choice_it_ and the destructor
// deletes it, but no copy constructor or operator= is declared here, so a
// copied ChoiceIterator would share (and later delete again) the same
// pointer. Avoid copying instances until that is confirmed or addressed.
class ChoiceIterator {
public:
// Construction is from a ResultIterator that points to the symbol of
// interest. The ChoiceIterator allows a one-shot iteration over the
// choices for this symbol and after that it is useless.
// result_it must be positioned on a word (not at the end of the page).
explicit ChoiceIterator(const ResultIterator& result_it);
~ChoiceIterator();
// Moves to the next choice for the symbol and returns false if there
// are none left.
bool Next();
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// choice, or NULL if there are no choices or the choice has no valid
// unichar id.
// NOTE: Unlike ResultIterator::GetUTF8Text, the return points to an
// internal structure and should NOT be delete[]ed to free after use.
const char* GetUTF8Text() const;
// Returns the confidence of the current choice.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
// Returns 0.0f if there are no choices.
float Confidence() const;
private:
// Pointer to the Tesseract object owned by the API.
Tesseract* tesseract_;
// Iterator over the blob choices. NULL if the word has no blob choices.
BLOB_CHOICE_IT* choice_it_;
};
} // namespace tesseract.
#endif // TESSERACT_API_RESULTITERATOR_H__

View File

@ -36,7 +36,7 @@ PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
rect_left_(rect_left), rect_top_(rect_top),
rect_width_(rect_width), rect_height_(rect_height) {
it_ = new PAGE_RES_IT(page_res);
Begin();
PageIterator::Begin();
}
PageIterator::~PageIterator() {
@ -73,6 +73,11 @@ const PageIterator& PageIterator::operator=(const PageIterator& src) {
return *this;
}
// Reports whether this iterator and other refer to the same position.
// Two NULL iterators count as equal; a NULL and a non-NULL iterator do not.
// Otherwise the two PAGE_RES_ITs are compared with operator==.
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
  if (it_ == NULL)
    return other == NULL;
  return (other != NULL) && (*it_ == *other);
}
// ============= Moving around within the page ============.
// Resets the iterator to point to the start of the page.
@ -81,12 +86,38 @@ void PageIterator::Begin() {
BeginWord(0);
}
void PageIterator::RestartParagraph() {
if (it_->block() == NULL) return; // At end of the document.
PAGE_RES_IT para(page_res_);
PAGE_RES_IT next_para(para);
next_para.forward_paragraph();
while (next_para.cmp(*it_) <= 0) {
para = next_para;
next_para.forward_paragraph();
}
*it_ = para;
BeginWord(0);
}
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
PageIterator p_start(*this);
p_start.RestartParagraph();
return p_start.it_->row() == it_->row();
}
// Moves the iterator back to the start of the current row, then sets up
// blob iteration at offset 0 of the word it lands on.
void PageIterator::RestartRow() {
  it_->restart_row();
  BeginWord(0);
}
// Moves to the start of the next object at the given level in the
// page hierarchy, and returns false if the end of the page was reached.
// NOTE that RIL_SYMBOL will skip non-text blocks, but all other
// PageIteratorLevel level values will visit each non-text block once.
// Think of non text blocks as containing a single para, with a single line,
// with a single imaginary word.
// NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
// non-text block at least once.
// Think of non text blocks as containing a single para, with at least one
// line, with a single imaginary word, containing a single symbol.
// The bounding boxes mark out any polygonal nature of the block, and
// PTIsTextType(BlockType()) is false for non-text blocks.
// Calls to Next with different levels may be freely intermixed.
// This function iterates words in right-to-left scripts correctly, if
// the appropriate language has been loaded into Tesseract.
@ -97,9 +128,11 @@ bool PageIterator::Next(PageIteratorLevel level) {
switch (level) {
case RIL_BLOCK:
case RIL_PARA:
it_->forward_block();
break;
case RIL_PARA:
it_->forward_paragraph();
break;
case RIL_TEXTLINE:
for (it_->forward_with_empties(); it_->row() == it_->prev_row();
it_->forward_with_empties());
@ -112,7 +145,7 @@ bool PageIterator::Next(PageIteratorLevel level) {
cblob_it_->forward();
++blob_index_;
if (blob_index_ >= word_length_)
it_->forward();
it_->forward_with_empties();
else
return true;
break;
@ -129,10 +162,13 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
if (it_->word() == NULL) return true; // In an image block.
switch (level) {
case RIL_BLOCK:
return blob_index_ == 0 && it_->block() != it_->prev_block();
case RIL_PARA:
return it_->block() != it_->prev_block();
return blob_index_ == 0 &&
(it_->block() != it_->prev_block() ||
it_->row()->row->para() != it_->prev_row()->row->para());
case RIL_TEXTLINE:
return it_->row() != it_->prev_row();
return blob_index_ == 0 && it_->row() != it_->prev_row();
case RIL_WORD:
return blob_index_ == 0;
case RIL_SYMBOL:
@ -145,7 +181,7 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
// given level. (e.g. the last word in a line, the last line in a block)
bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const {
if (it_->word() == NULL) return true; // Already at the end!
if (Empty(element)) return true; // Already at the end!
// The result is true if we step forward by element and find we are
// at the end of the page or at the beginning of *all* levels in:
// [level, element).
@ -154,7 +190,7 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
// word on a line, so we also have to be at the first symbol in a word.
PageIterator next(*this);
next.Next(element);
if (next.it_->word() == NULL) return true; // Reached the end of the page.
if (next.Empty(element)) return true; // Reached the end of the page.
while (element > level) {
element = static_cast<PageIteratorLevel>(element - 1);
if (!next.IsAtBeginningOf(element))
@ -163,6 +199,21 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
return true;
}
// Three-way comparison of this iterator's position against other:
//   -1 if this is before other,
//    0 if both are at the same position,
//    1 if this is after other.
// Positions are ordered first by word (via PAGE_RES_IT::cmp, whose non-zero
// result is returned as is) and then by blob index within the word.
int PageIterator::Cmp(const PageIterator &other) const {
  const int word_order = it_->cmp(*other.it_);
  if (word_order != 0)
    return word_order;
  if (blob_index_ == other.blob_index_)
    return 0;
  return blob_index_ < other.blob_index_ ? -1 : 1;
}
// ============= Accessing data ==============.
// Coordinate system:
// Integer coordinates are at the cracks between the pixels.
@ -176,22 +227,25 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.
// Returns the bounding rectangle of the current object at the given level.
// Returns the bounding rectangle of the current object at the given level in
// the coordinates of the working image that is pix_binary().
// See comment on coordinate system above.
// Returns false if there is no such object at the current position.
bool PageIterator::BoundingBox(PageIteratorLevel level,
int* left, int* top,
int* right, int* bottom) const {
if (it_->block() == NULL) return false; // Already at the end!
if (it_->word() == NULL && level != RIL_BLOCK) return false;
if (level == RIL_SYMBOL && blob_index_ >= word_length_)
return false; // Zero length word, or already at the end of it.
bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
int* left, int* top,
int* right, int* bottom) const {
if (Empty(level))
return false;
TBOX box;
PARA *para = NULL;
switch (level) {
case RIL_BLOCK:
case RIL_PARA:
box = it_->block()->block->bounding_box();
break;
case RIL_PARA:
para = it_->row()->row->para();
if (para == NULL) return false;
// explicit fall-through.
case RIL_TEXTLINE:
box = it_->row()->row->bounding_box();
break;
@ -204,22 +258,59 @@ bool PageIterator::BoundingBox(PageIteratorLevel level,
else
box = cblob_it_->data()->bounding_box();
}
if (level == RIL_PARA) {
PageIterator other = *this;
other.Begin();
do {
if (other.it_->row() && other.it_->row()->row &&
other.it_->row()->row->para() == para) {
box = box.bounding_union(other.it_->row()->row->bounding_box());
}
} while (other.Next(RIL_TEXTLINE));
}
if (level != RIL_SYMBOL || cblob_it_ != NULL)
box.rotate(it_->block()->block->re_rotation());
// Now we have a box in tesseract coordinates relative to the image rectangle,
// we have to convert the coords to global page coords in a top-down system.
*left = ClipToRange(box.left() / scale_ + rect_left_,
// we have to convert the coords to a top-down system.
const int pix_height = pixGetHeight(tesseract_->pix_binary());
const int pix_width = pixGetWidth(tesseract_->pix_binary());
*left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
*top = ClipToRange(pix_height - box.top(), 0, pix_height);
*right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
*bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
return true;
}
// Returns the bounding rectangle of the current object at the given level in
// coordinates of the original image.
// See comment on coordinate system above.
// Returns false if there is no such object at the current position.
bool PageIterator::BoundingBox(PageIteratorLevel level,
int* left, int* top,
int* right, int* bottom) const {
if (!BoundingBoxInternal(level, left, top, right, bottom))
return false;
// Convert to the coordinate system of the original image.
*left = ClipToRange(*left / scale_ + rect_left_,
rect_left_, rect_left_ + rect_width_);
*top = ClipToRange((rect_height_ - box.top()) / scale_ + rect_top_,
*top = ClipToRange(*top / scale_ + rect_top_,
rect_top_, rect_top_ + rect_height_);
*right = ClipToRange((box.right() + scale_ - 1) / scale_ + rect_left_,
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_,
*left, rect_left_ + rect_width_);
*bottom = ClipToRange((rect_height_ - box.bottom() + scale_ - 1) / scale_
+ rect_top_,
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_,
*top, rect_top_ + rect_height_);
return true;
}
// Returns true if there is no object at the given level at the current
// position. That is the case when:
//   - there is no current block (already at the end), or
//   - there is no current word (image block) and level is not RIL_BLOCK, or
//   - level is RIL_SYMBOL and the blob index is at or past the word length
//     (zero length word, or already at the end of it).
bool PageIterator::Empty(PageIteratorLevel level) const {
  return it_->block() == NULL ||
         (it_->word() == NULL && level != RIL_BLOCK) ||
         (level == RIL_SYMBOL && blob_index_ >= word_length_);
}
// Returns the type of the current block. See apitypes.h for PolyBlockType.
PolyBlockType PageIterator::BlockType() const {
if (it_->block() == NULL || it_->block()->block == NULL)
@ -230,7 +321,8 @@ PolyBlockType PageIterator::BlockType() const {
}
// Returns a binary image of the current object at the given level.
// The position and size match the return from BoundingBox.
// The position and size match the return from BoundingBoxInternal, and so this
// could be upscaled with respect to the original input image.
// Use pixDestroy to delete the image after use.
// The following methods are used to generate the images:
// RIL_BLOCK: mask the page image with the block polygon.
@ -250,22 +342,23 @@ PolyBlockType PageIterator::BlockType() const {
// components.
Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
int left, top, right, bottom;
if (!BoundingBox(level, &left, &top, &right, &bottom))
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
return NULL;
Pix* pix = NULL;
switch (level) {
case RIL_BLOCK:
case RIL_PARA:
pix = it_->block()->block->render_mask();
// AND the mask and the image.
pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
PIX_SRC & PIX_DST, tesseract_->pix_binary(),
left, top);
break;
case RIL_PARA:
case RIL_TEXTLINE:
case RIL_WORD:
case RIL_SYMBOL:
if (level == RIL_SYMBOL && cblob_it_ != NULL)
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
cblob_it_->data()->area() != 0)
return cblob_it_->data()->render();
// Just clip from the bounding box.
Box* box = boxCreate(left, top, right - left, bottom - top);
@ -301,7 +394,7 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
Pix* grey_pix = pixClipRectangle(pix, box, NULL);
boxDestroy(&box);
if (level == RIL_BLOCK || level == RIL_PARA) {
if (level == RIL_BLOCK) {
Pix* mask = it_->block()->block->render_mask();
Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
pixRasterop(expanded_mask, padding, padding,
@ -316,7 +409,6 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
return grey_pix;
}
// Returns the baseline of the current object at the given level.
// The baseline is the line that passes through (x1, y1) and (x2, y2).
// WARNING: with vertical text, baselines may be vertical!
@ -345,7 +437,7 @@ bool PageIterator::Baseline(PageIteratorLevel level,
void PageIterator::Orientation(tesseract::Orientation *orientation,
tesseract::WritingDirection *writing_direction,
tesseract::TextlineOrder *textline_order,
float *deskew_angle) {
float *deskew_angle) const {
BLOCK* block = it_->block()->block;
// Orientation
@ -388,6 +480,22 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
*deskew_angle = -skew.angle();
}
// Reports paragraph-level properties of the row at the current position.
// *just is always set to JUSTIFICATION_UNKNOWN. If there is no current row,
// no paragraph for it, or no model for that paragraph, the function returns
// at that point and the other three outputs are left untouched.
// Otherwise *is_list_item and *is_crown are copied from the paragraph and
// *first_line_indent is the model's first-line indent minus its body indent.
void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
                                 bool *is_list_item,
                                 bool *is_crown,
                                 int *first_line_indent) const {
  *just = tesseract::JUSTIFICATION_UNKNOWN;
  if (!it_->row() || !it_->row()->row)
    return;
  PARA *para = it_->row()->row->para();
  if (!para || !para->model)
    return;

  *is_list_item = para->is_list_item;
  *is_crown = para->is_very_first_or_continuation;
  const int first_indent = para->model->first_indent();
  *first_line_indent = first_indent - para->model->body_indent();
}
// Sets up the internal data for iterating the blobs of a new word, then
// moves the iterator to the given offset.
void PageIterator::BeginWord(int offset) {
@ -404,6 +512,12 @@ void PageIterator::BeginWord(int offset) {
// is already baseline denormalized.
word_length_ = word_res->best_choice->length();
ASSERT_HOST(word_res->box_word != NULL);
if (word_res->box_word->length() != word_length_) {
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
word_length_, word_res->best_choice->unichar_string().string(),
word_res->box_word->length());
word_res->box_word->bounding_box().print();
}
ASSERT_HOST(word_res->box_word->length() == word_length_);
word_ = NULL;
// We will be iterating the box_word.

View File

@ -18,10 +18,10 @@
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_API_PAGEITERATOR_H__
#define TESSERACT_API_PAGEITERATOR_H__
#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
#define TESSERACT_CCMAIN_PAGEITERATOR_H__
#include "apitypes.h"
#include "publictypes.h"
class C_BLOB_IT;
class PBLOB_IT;
@ -72,10 +72,27 @@ class PageIterator {
PageIterator(const PageIterator& src);
const PageIterator& operator=(const PageIterator& src);
// Are we positioned at the same location as other?
bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
// ============= Moving around within the page ============.
// Moves the iterator to point to the start of the page to begin an iteration.
void Begin();
virtual void Begin();
// Moves the iterator to the beginning of the paragraph.
// This class implements this functionality by moving it to the zero indexed
// blob of the first (leftmost) word on the first row of the paragraph.
virtual void RestartParagraph();
// Return whether this iterator points anywhere in the first textline of a
// paragraph.
bool IsWithinFirstTextlineOfParagraph() const;
// Moves the iterator to the beginning of the text line.
// This class implements this functionality by moving it to the zero indexed
// blob of the first (leftmost) word of the row.
virtual void RestartRow();
// Moves to the start of the next object at the given level in the
// page hierarchy, and returns false if the end of the page was reached.
@ -86,17 +103,43 @@ class PageIterator {
// Calls to Next with different levels may be freely intermixed.
// This function iterates words in right-to-left scripts correctly, if
// the appropriate language has been loaded into Tesseract.
bool Next(PageIteratorLevel level);
virtual bool Next(PageIteratorLevel level);
// Returns true if the iterator is at the start of an object at the given
// level. Possible uses include determining if a call to Next(RIL_WORD)
// moved to the start of a RIL_PARA.
bool IsAtBeginningOf(PageIteratorLevel level) const;
// level.
//
// For instance, suppose an iterator it is pointed to the first symbol of the
// first word of the third line of the second paragraph of the first block in
// a page, then:
// it.IsAtBeginningOf(RIL_BLOCK) = false
// it.IsAtBeginningOf(RIL_PARA) = false
// it.IsAtBeginningOf(RIL_TEXTLINE) = true
// it.IsAtBeginningOf(RIL_WORD) = true
// it.IsAtBeginningOf(RIL_SYMBOL) = true
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
// Returns whether the iterator is positioned at the last element in a
// given level. (e.g. the last word in a line, the last line in a block)
bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const;
//
// Here's some two-paragraph example
// text. It starts off innocuously
// enough but quickly turns bizarre.
// The author inserts a cornucopia
// of words to guard against confused
// references.
//
// Now take an iterator it pointed to the start of "bizarre."
// it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
// it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
// it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
virtual bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const;
// Returns whether this iterator is positioned
// before other: -1
// equal to other: 0
// after other: 1
int Cmp(const PageIterator &other) const;
// ============= Accessing data ==============.
// Coordinate system:
@ -120,12 +163,21 @@ class PageIterator {
// the image to include more foreground pixels. See GetImage below.
bool BoundingBox(PageIteratorLevel level,
int* left, int* top, int* right, int* bottom) const;
// Returns the bounding rectangle of the object in a coordinate system of the
// working image rectangle having its origin at (rect_left_, rect_top_) with
// respect to the original image and is scaled by a factor scale_.
bool BoundingBoxInternal(PageIteratorLevel level,
int* left, int* top, int* right, int* bottom) const;
// Returns whether there is no object of a given level.
bool Empty(PageIteratorLevel level) const;
// Returns the type of the current block. See apitypes.h for PolyBlockType.
PolyBlockType BlockType() const;
// Returns a binary image of the current object at the given level.
// The position and size match the return from BoundingBox.
// The position and size match the return from BoundingBoxInternal, and so
// this could be upscaled with respect to the original input image.
// Use pixDestroy to delete the image after use.
Pix* GetBinaryImage(PageIteratorLevel level) const;
@ -156,7 +208,38 @@ class PageIterator {
void Orientation(tesseract::Orientation *orientation,
tesseract::WritingDirection *writing_direction,
tesseract::TextlineOrder *textline_order,
float *deskew_angle);
float *deskew_angle) const;
// Returns information about the current paragraph, if available.
//
// justification -
// LEFT if ragged right, or fully justified and script is left-to-right.
// RIGHT if ragged left, or fully justified and script is right-to-left.
// unknown if it looks like source code or we have very few lines.
// is_list_item -
// true if we believe this is a member of an ordered or unordered list.
// is_crown -
// true if the first line of the paragraph is aligned with the other
// lines of the paragraph even though subsequent paragraphs have first
// line indents. This typically indicates that this is the continuation
// of a previous paragraph or that it is the very first paragraph in
// the chapter.
// first_line_indent -
// For LEFT aligned paragraphs, the first text line of paragraphs of
// this kind are indented this many pixels from the left edge of the
// rest of the paragraph.
// for RIGHT aligned paragraphs, the first text line of paragraphs of
// this kind are indented this many pixels from the right edge of the
// rest of the paragraph.
// NOTE 1: This value may be negative.
// NOTE 2: if *is_crown == true, the first line of this paragraph is
// actually flush, and first_line_indent is set to the "common"
// first_line_indent for subsequent paragraphs in this block
// of text.
void ParagraphInfo(tesseract::ParagraphJustification *justification,
bool *is_list_item,
bool *is_crown,
int *first_line_indent) const;
protected:
// Sets up the internal data for iterating the blobs of a new word, then
@ -192,4 +275,4 @@ class PageIterator {
} // namespace tesseract.
#endif // TESSERACT_API_PAGEITERATOR_H__
#endif // TESSERACT_CCMAIN_PAGEITERATOR_H__

663
ccmain/resultiterator.cpp Normal file
View File

@ -0,0 +1,663 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.cpp
// Description: Iterator for tesseract results that is capable of
// iterating in proper reading order over Bi Directional
// (e.g. mixed Hebrew and English) text.
// Author: David Eger
// Created: Fri May 27 13:58:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "resultiterator.h"
#include "allheaders.h"
#include "pageres.h"
#include "strngs.h"
#include "tesseractclass.h"
#include "unicharset.h"
#include "unicodes.h"
namespace tesseract {
// Builds a bidi-aware iterator as a copy of the given left-to-right iterator.
// Both minor-run flags start cleared, the paragraph direction is computed
// for the current position, and the iterator is then moved to the logical
// (reading order) start of the current text line.
ResultIterator::ResultIterator(const LTRResultIterator &resit)
    : LTRResultIterator(resit),
      at_beginning_of_minor_run_(false),
      in_minor_direction_(false) {
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}
// Factory: returns a newly allocated ResultIterator constructed from resit.
// The object is created with new, so the caller owns the returned pointer.
ResultIterator *ResultIterator::StartOfParagraph(
    const LTRResultIterator &resit) {
  ResultIterator *created = new ResultIterator(resit);
  return created;
}
// Returns the cached paragraph direction: true when the current paragraph
// was judged to read left-to-right, false when right-to-left.
bool ResultIterator::ParagraphIsLtr() const {
  const bool paragraph_reads_ltr = current_paragraph_is_ltr_;
  return paragraph_reads_ltr;
}
// Determines the dominant reading direction of the paragraph that contains
// the current position. Returns true for left-to-right and false for
// right-to-left. The scan is done on a left-to-right copy of this iterator,
// so this iterator itself is not moved. When there is no current word the
// answer is true.
bool ResultIterator::CurrentParagraphIsLtr() const {
if (!it_->word())
return true; // doesn't matter.
LTRResultIterator it(*this);
it.RestartParagraph();
// Try to figure out the ltr-ness of the paragraph. The rules below
// make more sense in the context of a difficult paragraph example.
// Here we denote {ltr characters, RTL CHARACTERS}:
//
// "don't go in there!" DAIS EH
// EHT OTNI DEPMUJ FELSMIH NEHT DNA
// .GNIDLIUB GNINRUB
//
// On the first line, the left-most word is LTR and the rightmost word
// is RTL. Thus, we are better off taking the majority direction for
// the whole paragraph contents. So instead of "the leftmost word is LTR"
// indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
// would not do: Typically an RTL paragraph would *not* start with an LTR
// word. So our heuristics are as follows:
//
// (1) If the first text line has an RTL word in the left-most position
// it is RTL.
// (2) If the first text line has an LTR word in the right-most position
// it is LTR.
// (3) If neither of the above is true, take the majority count for the
// paragraph -- if there are more rtl words, it is RTL. If there
// are more LTR words, it's LTR.
// The iterator now sits on the left-most word of the first text line.
bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
int num_ltr, num_rtl;
num_rtl = leftmost_rtl ? 1 : 0;
num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
// Walk the remaining words of the first text line. rightmost_ltr is
// overwritten on every step, so after the loop it describes the last
// (right-most) word of that line.
for (it.Next(RIL_WORD);
!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
it.Next(RIL_WORD)) {
StrongScriptDirection dir = it.WordDirection();
rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
num_ltr += rightmost_ltr ? 1 : 0;
}
// Rule (1) takes precedence over rule (2).
if (leftmost_rtl)
return false;
if (rightmost_ltr)
return true;
// First line is ambiguous. Take statistics on the whole paragraph.
// The iterator is already on the first word of the second text line (if
// any), so the counts from the first line are carried over and extended
// until the next paragraph starts or the page ends.
if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
StrongScriptDirection dir = it.WordDirection();
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
} while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
// Ties are resolved in favour of left-to-right.
return num_ltr >= num_rtl;
}
// Negative meta-marks that CalculateTextlineOrder mixes in with the
// (non-negative) word indices of a reading order.
// Emitted before the first word of a minor-direction run.
const int ResultIterator::kMinorRunStart(-1);
// Emitted after the last word of a minor-direction run.
const int ResultIterator::kMinorRunEnd(-2);
// Emitted after a word whose direction is DIR_MIX.
const int ResultIterator::kComplexWord(-3);
// Fills blob_indices with the blob (symbol) indices of the current word in
// reading order. The vector is always cleared first and is left empty when
// there is no current word. In a left-to-right context, or when the word
// reports that its unichars are already in reading order, the result is
// simply 0..word_length_-1. Otherwise the symbols are classified into LTR
// and RTL runs and emitted right-to-left, keeping each LTR run in its
// left-to-right order.
void ResultIterator::CalculateBlobOrder(
GenericVector<int> *blob_indices) const {
// The effective reading direction flips inside a minor-direction run.
bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
blob_indices->clear();
if (Empty(RIL_WORD)) return;
if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
// Easy! just return the blobs in order;
for (int i = 0; i < word_length_; i++)
blob_indices->push_back(i);
return;
}
// The blobs are in left-to-right order, but the current reading context
// is right-to-left.
const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
// Step 1: Scan for and mark European Number sequences
// [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
GenericVector<int> letter_types;
for (int i = 0; i < word_length_; i++) {
letter_types.push_back(it_->word()->SymbolDirection(i));
}
// Convert a single separator sandwiched between two EN's into an EN.
for (int i = 0; i + 2 < word_length_; i++) {
if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
(letter_types[i + 1] == U_EURO_NUM_SEP ||
letter_types[i + 1] == U_COMMON_NUM_SEP)) {
letter_types[i + 1] = U_EURO_NUM;
}
}
// Scan for sequences of European Number Terminators around ENs and convert
// them to ENs.
for (int i = 0; i < word_length_; i++) {
if (letter_types[i] == U_EURO_NUM_TERM) {
// Look right: skip the run of terminators starting at i.
int j = i + 1;
while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
// The sequence [i..j] should be converted to all European Numbers.
// (Position j is already an EN, so only [i..j) needs rewriting.)
for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
}
// Look left: skip the run of terminators ending at i.
j = i - 1;
while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
if (j > -1 && letter_types[j] == U_EURO_NUM) {
// The sequence [j..i] should be converted to all European Numbers.
for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
}
}
}
// Step 2: Convert all remaining types to either L or R.
// Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
// All other are R.
for (int i = 0; i < word_length_;) {
int ti = letter_types[i];
if (ti == U_LTR || ti == U_EURO_NUM) {
// Left to right sequence; scan to the end of it.
// Neutrals and common separators are only absorbed when another L/EN
// follows them, which is why last_good tracks the last L/EN seen.
int last_good = i;
for (int j = i + 1; j < word_length_; j++) {
int tj = letter_types[j];
if (tj == U_LTR || tj == U_EURO_NUM) {
last_good = j;
} else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
// do nothing.
} else {
break;
}
}
// [i..last_good] is the L sequence
for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
i = last_good + 1;
} else {
letter_types[i] = U_RTL;
i++;
}
}
// At this point, letter_types is entirely U_LTR or U_RTL.
// Step 3: emit indices from the right end of the word towards the left.
// RTL symbols are emitted one at a time; each LTR run is emitted as a unit
// in ascending index order.
for (int i = word_length_ - 1; i >= 0;) {
if (letter_types[i] == U_RTL) {
blob_indices->push_back(i);
i--;
} else {
// left to right sequence. scan to the beginning.
int j = i - 1;
for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
// Now (j, i] is LTR
for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
i = j;
}
}
// Every blob index must have been emitted exactly once.
ASSERT_HOST(blob_indices->size() == word_length_);
}
// Debug helper: prints one two-character tag per entry of dirs
// (N = neutral, L = left-to-right, R = right-to-left, Z = mixed,
// ? = anything else) followed by a newline.
static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
  const int count = dirs.size();
  for (int n = 0; n < count; ++n) {
    const char *tag = "? ";
    switch (dirs[n]) {
      case DIR_NEUTRAL:
        tag = "N ";
        break;
      case DIR_LEFT_TO_RIGHT:
        tag = "L ";
        break;
      case DIR_RIGHT_TO_LEFT:
        tag = "R ";
        break;
      case DIR_MIX:
        tag = "Z ";
        break;
      default:
        break;
    }
    tprintf("%s", tag);
  }
  tprintf("\n");
}
void ResultIterator::CalculateTextlineOrder(
bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVectorEqEq<int> *word_indices) const {
GenericVector<StrongScriptDirection> directions;
CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
}
void ResultIterator::CalculateTextlineOrder(
bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVector<StrongScriptDirection> *dirs_arg,
GenericVectorEqEq<int> *word_indices) const {
GenericVector<StrongScriptDirection> dirs;
GenericVector<StrongScriptDirection> *directions;
directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
directions->truncate(0);
// A LTRResultIterator goes strictly left-to-right word order.
LTRResultIterator ltr_it(resit);
ltr_it.RestartRow();
if (ltr_it.Empty(RIL_WORD)) return;
do {
directions->push_back(ltr_it.WordDirection());
} while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
word_indices->truncate(0);
CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
}
// Computes the reading order for the words of one text line.
// paragraph_is_ltr: the dominant direction of the enclosing paragraph.
// word_dirs: direction of each word, indexed left to right.
// reading_order: output; emptied first, then filled with word indices in
// reading order, interleaved with the negative meta-marks
// kMinorRunStart / kMinorRunEnd (bracketing a run of
// minor-direction words) and kComplexWord (placed directly
// after a DIR_MIX word).
void ResultIterator::CalculateTextlineOrder(
bool paragraph_is_ltr,
const GenericVector<StrongScriptDirection> &word_dirs,
GenericVectorEqEq<int> *reading_order) {
reading_order->truncate(0);
if (word_dirs.size() == 0) return;
// Take all of the runs of minor direction words and insert them
// in reverse order.
// The line is walked from `start` towards `end` (exclusive) in steps of
// major_step: left to right for an LTR paragraph, right to left otherwise.
int minor_direction, major_direction, major_step, start, end;
if (paragraph_is_ltr) {
start = 0;
end = word_dirs.size();
major_step = 1;
major_direction = DIR_LEFT_TO_RIGHT;
minor_direction = DIR_RIGHT_TO_LEFT;
} else {
start = word_dirs.size() - 1;
end = -1;
major_step = -1;
major_direction = DIR_RIGHT_TO_LEFT;
minor_direction = DIR_LEFT_TO_RIGHT;
// Special rule: if there are neutral words at the right most side
// of a line adjacent to a left-to-right word in the middle of the
// line, we interpret the end of the line as a single LTR sequence.
if (word_dirs[start] == DIR_NEUTRAL) {
// Skip leftwards over the trailing neutral words.
int neutral_end = start;
while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
neutral_end--;
}
if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
// LTR followed by neutrals.
// Scan for the beginning of the minor left-to-right run.
// `left` ends up on the left-most LTR word reachable without crossing
// an RTL word.
int left = neutral_end;
for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
}
// Emit [left..end of line] as one minor run in left-to-right order.
reading_order->push_back(kMinorRunStart);
for (int i = left; i < word_dirs.size(); i++) {
reading_order->push_back(i);
if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
}
reading_order->push_back(kMinorRunEnd);
// Resume the normal right-to-left walk just left of that run.
start = left - 1;
}
}
}
for (int i = start; i != end;) {
if (word_dirs[i] == minor_direction) {
// Find the far end of the minor run: advance j to the next
// major-direction word (or the line end), then back it up to the last
// minor-direction word so trailing neutral/mixed words are not included.
int j = i;
while (j != end && word_dirs[j] != major_direction)
j += major_step;
if (j == end) j -= major_step;
while (j != i && word_dirs[j] != minor_direction)
j -= major_step;
// [j..i] is a minor direction run.
// It is emitted from j back to i, i.e. against the major direction.
reading_order->push_back(kMinorRunStart);
for (int k = j; k != i; k -= major_step) {
reading_order->push_back(k);
}
reading_order->push_back(i);
reading_order->push_back(kMinorRunEnd);
i = j + major_step;
} else {
reading_order->push_back(i);
if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
i += major_step;
}
}
}
int ResultIterator::LTRWordIndex() const {
int this_word_index = 0;
LTRResultIterator textline(*this);
textline.RestartRow();
while (!textline.PositionedAtSameWord(it_)) {
this_word_index++;
textline.Next(RIL_WORD);
}
return this_word_index;
}
void ResultIterator::MoveToLogicalStartOfWord() {
if (word_length_ == 0) {
BeginWord(0);
return;
}
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
if (blob_order.size() == 0 || blob_order[0] == 0) return;
BeginWord(blob_order[0]);
}
bool ResultIterator::IsAtFinalSymbolOfWord() const {
if (!it_->word()) return true;
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
return blob_order.size() == 0 || blob_order.back() == blob_index_;
}
bool ResultIterator::IsAtFirstSymbolOfWord() const {
if (!it_->word()) return true;
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
return blob_order.size() == 0 || blob_order[0] == blob_index_;
}
// Appends to text the Unicode bidi control mark (LRM/RLM), if any, that
// must follow the current word. Does nothing when there is no current word
// or when the word cannot be found in the text line's reading order.
//
// The decision is driven by the meta-marks that CalculateTextlineOrder
// placed directly after this word in the reading order:
//   * kComplexWord: the word mixes directions, so the mark of the current
//     reading direction is appended to pin it down.
//   * kMinorRunEnd: the word closes a minor-direction run, so the mark of
//     the other (i.e. the paragraph's) direction is appended to return the
//     reader to the major direction.
void ResultIterator::AppendSuffixMarks(STRING *text) const {
  if (!it_->word()) return;
  bool reading_direction_is_ltr =
      current_paragraph_is_ltr_ ^ in_minor_direction_;
  // scan forward to see what meta-information the word ordering algorithm
  // left us.
  GenericVectorEqEq<int> textline_order;
  CalculateTextlineOrder(current_paragraph_is_ltr_,
                         *this, &textline_order);
  int this_word_index = LTRWordIndex();
  int i = textline_order.get_index(this_word_index);
  if (i < 0) return;

  // Only the last of the consecutive meta-marks after the word matters.
  int last_non_word_mark = 0;
  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
    last_non_word_mark = textline_order[i];
  }
  if (last_non_word_mark == kComplexWord) {
    *text += reading_direction_is_ltr ? kLRM : kRLM;
  } else if (last_non_word_mark == kMinorRunEnd) {
    // Previously both branches appended the same RLM+LRM pair, which left
    // right-to-left paragraphs ending a minor run in the wrong direction.
    *text += current_paragraph_is_ltr_ ? kLRM : kRLM;
  }
}
void ResultIterator::MoveToLogicalStartOfTextline() {
GenericVectorEqEq<int> word_indices;
RestartRow();
CalculateTextlineOrder(current_paragraph_is_ltr_,
dynamic_cast<const LTRResultIterator&>(*this),
&word_indices);
int i = 0;
for (; i < word_indices.size() && word_indices[i] < 0; i++) {
if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
}
if (in_minor_direction_) at_beginning_of_minor_run_ = true;
if (i >= word_indices.size()) return;
int first_word_index = word_indices[i];
for (int j = 0; j < first_word_index; j++) {
PageIterator::Next(RIL_WORD);
}
MoveToLogicalStartOfWord();
}
// Rewinds to the start of the page, clears the minor-run state, recomputes
// the direction of the first paragraph and moves to the logical start of
// its first text line.
void ResultIterator::Begin() {
  LTRResultIterator::Begin();
  at_beginning_of_minor_run_ = false;
  in_minor_direction_ = false;
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}
// Advances to the next object of the given level in reading order.
// Returns false when the iterator is already at, or runs off, the end of
// the page.
//  * RIL_BLOCK / RIL_PARA / RIL_TEXTLINE: delegate the move to PageIterator,
//    refresh the paragraph direction if a new paragraph was entered, then
//    jump to the logical start of the new text line.
//  * RIL_SYMBOL: step to the next blob of the current word in reading order;
//    when the word is exhausted, continue as RIL_WORD.
//  * RIL_WORD: step to the word that follows the current one in the text
//    line's reading order; when the line is exhausted, continue as
//    RIL_TEXTLINE.
bool ResultIterator::Next(PageIteratorLevel level) {
if (it_->block() == NULL) return false; // already at end!
switch (level) {
case RIL_BLOCK: // explicit fall-through
case RIL_PARA: // explicit fall-through
case RIL_TEXTLINE:
if (!PageIterator::Next(level)) return false;
if (IsWithinFirstTextlineOfParagraph()) {
// if we've advanced to a new paragraph,
// recalculate current_paragraph_is_ltr_
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
}
in_minor_direction_ = false;
MoveToLogicalStartOfTextline();
return it_->block() != NULL;
case RIL_SYMBOL:
{
GenericVector<int> blob_order;
CalculateBlobOrder(&blob_order);
// Locate the current blob in the reading order, then step one past it.
int next_blob = 0;
while (next_blob < blob_order.size() &&
blob_index_ != blob_order[next_blob])
next_blob++;
next_blob++;
if (next_blob < blob_order.size()) {
// we're in the same word; simply advance one blob.
BeginWord(blob_order[next_blob]);
at_beginning_of_minor_run_ = false;
return true;
}
level = RIL_WORD; // we've fallen through to the next word.
}
case RIL_WORD: // explicit fall-through.
{
// No word here (e.g. a non-text block): move on to the next block.
if (it_->word() == NULL) return Next(RIL_BLOCK);
GenericVectorEqEq<int> word_indices;
int this_word_index = LTRWordIndex();
CalculateTextlineOrder(current_paragraph_is_ltr_,
*this,
&word_indices);
// Index of the last real (non-negative) word entry; trailing meta-marks
// are ignored.
int final_real_index = word_indices.size() - 1;
while (final_real_index > 0 && word_indices[final_real_index] < 0)
final_real_index--;
// Only positions before final_real_index are searched: if the current
// word is the last real entry there is no successor on this line.
for (int i = 0; i < final_real_index; i++) {
if (word_indices[i] == this_word_index) {
// Skip the meta-marks between this word and the next, tracking whether
// they take us into or out of a minor-direction run.
int j = i + 1;
for (; j < final_real_index && word_indices[j] < 0; j++) {
if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
}
at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
// awesome, we move to word_indices[j]
if (BidiDebug(3)) {
tprintf("Next(RIL_WORD): %d -> %d\n",
this_word_index, word_indices[j]);
}
// Reposition by restarting the row and stepping right word by word.
PageIterator::RestartRow();
for (int k = 0; k < word_indices[j]; k++) {
PageIterator::Next(RIL_WORD);
}
MoveToLogicalStartOfWord();
return true;
}
}
if (BidiDebug(3)) {
tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
}
// we're going off the end of the text line.
return Next(RIL_TEXTLINE);
}
}
ASSERT_HOST(false); // shouldn't happen.
return false;
}
// Returns whether the iterator sits at the logical (reading order) start of
// an object of the given level. Returns false at the end of the page and
// true whenever there is no current word (image block).
bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == NULL) return false;  // Already at the end!
  if (it_->word() == NULL) return true;    // In an image block.
  if (level == RIL_SYMBOL) return true;    // Always at beginning of a symbol.

  const bool word_start = IsAtFirstSymbolOfWord();
  if (level == RIL_WORD) return word_start;

  // A scratch copy is sent to the first word of the line in reading order;
  // we are at the line start if that is the word we are on.
  ResultIterator probe(*this);
  probe.MoveToLogicalStartOfTextline();
  const bool line_start = word_start && *probe.it_ == *it_;
  if (level == RIL_TEXTLINE) return line_start;

  // Block and paragraph boundaries are judged from the left-most word of
  // the row, where the prev_* links refer to the previous row.
  probe.RestartRow();
  const bool block_start =
      line_start && probe.it_->block() != probe.it_->prev_block();
  if (level == RIL_BLOCK) return block_start;

  bool para_start = block_start;
  if (!para_start && line_start) {
    para_start = probe.it_->row()->row->para() !=
                 probe.it_->prev_row()->row->para();
  }
  if (level == RIL_PARA) return para_start;

  ASSERT_HOST(false);  // shouldn't happen.
  return false;
}
// NOTE! This mirrors PageIterator::IsAtFinalElement; the only difference is
// that the look-ahead iterator is a ResultIterator, so the step forward is
// taken in reading order.
// Returns whether the current `element` is the last one inside the enclosing
// object of the given `level`.
bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
                                      PageIteratorLevel element) const {
  if (Empty(element)) return true;  // Already at the end!
  // Step one element forward on a copy. We were on the final element if the
  // copy fell off the page, or if it landed at the beginning of *every*
  // level in [level, element).
  // With more than one level between element and level, moving forward one
  // symbol could still leave us on the first word of a line, hence every
  // intermediate level has to be checked.
  ResultIterator lookahead(*this);
  lookahead.Next(element);
  if (lookahead.Empty(element)) return true;  // Reached the end of the page.
  for (int lvl = element - 1; lvl >= level; --lvl) {
    if (!lookahead.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl))) {
      return false;
    }
  }
  return true;
}
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level, in reading order and including any bidi
// control marks. Use delete [] to free after use.
// Returns NULL when there is no current word.
char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  if (it_->word() == NULL) return NULL;  // Already at the end!
  STRING text;
  switch (level) {
    case RIL_BLOCK:
      {
        // Concatenate every paragraph that belongs to the current block.
        ResultIterator pp(*this);
        do {
          pp.AppendUTF8ParagraphText(&text);
        } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
      }
      break;
    case RIL_PARA:
      AppendUTF8ParagraphText(&text);
      break;
    case RIL_TEXTLINE:
      {
        // Work on a copy so that this (const) iterator is not moved.
        ResultIterator it(*this);
        it.MoveToLogicalStartOfTextline();
        it.IterateAndAppendUTF8TextlineText(&text);
      }
      break;
    case RIL_WORD:
      AppendUTF8WordText(&text);
      break;
    case RIL_SYMBOL:
      {
        bool reading_direction_is_ltr =
            current_paragraph_is_ltr_ ^ in_minor_direction_;
        if (at_beginning_of_minor_run_) {
          text += reading_direction_is_ltr ? kLRM : kRLM;
        }
        // Append (not assign): assigning here used to throw away the
        // direction mark that was just added for the start of a minor run.
        text += it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
        if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
      }
      break;
  }
  // Hand back a heap copy, including the terminating NUL.
  int length = text.length() + 1;
  char* result = new char[length];
  strncpy(result, text.string(), length);
  return result;
}
// Appends the current word to text, symbol by symbol in reading order.
// A direction mark is emitted first when the word opens a minor-direction
// run, and any required suffix marks are emitted last. Does nothing when
// there is no current word.
void ResultIterator::AppendUTF8WordText(STRING *text) const {
  if (it_->word() == NULL) return;
  ASSERT_HOST(it_->word()->best_choice != NULL);

  // Inside a minor run the reading direction is opposite to the paragraph's.
  const bool ltr_context = current_paragraph_is_ltr_ != in_minor_direction_;
  if (at_beginning_of_minor_run_) {
    if (ltr_context) {
      *text += kLRM;
    } else {
      *text += kRLM;
    }
  }

  GenericVector<int> reading_order;
  CalculateBlobOrder(&reading_order);
  const int blob_count = reading_order.size();
  for (int n = 0; n < blob_count; ++n) {
    *text += it_->word()->BestUTF8(reading_order[n], !ltr_context);
  }
  AppendSuffixMarks(text);
}
// Appends the words of the current text line to text, separated by single
// spaces and followed by line_separator_ (plus paragraph_separator_ when the
// line closes a paragraph or the page). The iterator is advanced past the
// line. When there is no word at the current position the iterator is only
// advanced by one word and nothing is appended.
void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
  if (Empty(RIL_WORD)) {
    Next(RIL_WORD);
    return;
  }

  if (BidiDebug(1)) {
    // Dump the per-word directions and the derived reading order.
    GenericVectorEqEq<int> reading_order;
    GenericVector<StrongScriptDirection> word_dirs;
    CalculateTextlineOrder(current_paragraph_is_ltr_,
                           *this, &word_dirs, &reading_order);
    tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
            current_paragraph_is_ltr_ ? "ltr" : "rtl");
    PrintScriptDirs(word_dirs);
    tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
            current_paragraph_is_ltr_ ? "ltr" : "rtl");
    const int entries = reading_order.size();
    for (int n = 0; n < entries; ++n) {
      tprintf("%d ", reading_order[n]);
    }
    tprintf("\n");
  }

  int word_count = 0;
  for (;;) {
    AppendUTF8WordText(text);
    ++word_count;
    *text += " ";
    // Stop at the end of the page or once the next line has been entered.
    if (!Next(RIL_WORD) || IsAtBeginningOf(RIL_TEXTLINE)) break;
  }
  if (BidiDebug(1)) {
    tprintf("%d words printed\n", word_count);
  }

  // Drop the space that was added after the final word.
  text->truncate_at(text->length() - 1);
  *text += line_separator_;
  // If we just finished a paragraph, add an extra newline.
  const bool paragraph_done =
      it_->block() == NULL || IsAtBeginningOf(RIL_PARA);
  if (paragraph_done) {
    *text += paragraph_separator_;
  }
}
// Appends the text of the whole current paragraph, line by line in reading
// order, to text. Works on a copy, so this iterator is not moved.
void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
  ResultIterator cursor(*this);
  cursor.RestartParagraph();
  cursor.MoveToLogicalStartOfTextline();
  if (cursor.Empty(RIL_WORD)) return;
  for (;;) {
    cursor.IterateAndAppendUTF8TextlineText(text);
    // Finished once we ran off the page or entered the next paragraph.
    if (cursor.it_->block() == NULL || cursor.IsAtBeginningOf(RIL_PARA)) {
      break;
    }
  }
}
bool ResultIterator::BidiDebug(int min_level) const {
int debug_level = 1;
IntParam *p = ParamUtils::FindParam<IntParam>(
"bidi_debug", GlobalParams()->int_params,
tesseract_->params()->int_params);
if (p != NULL) debug_level = (inT32)(*p);
return debug_level >= min_level;
}
} // namespace tesseract.

198
ccmain/resultiterator.h Normal file
View File

@ -0,0 +1,198 @@
///////////////////////////////////////////////////////////////////////
// File: resultiterator.h
// Description: Iterator for tesseract results that is capable of
// iterating in proper reading order over Bi Directional
// (e.g. mixed Hebrew and English) text.
// Author: David Eger
// Created: Fri May 27 13:58:06 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H__
#include "ltrresultiterator.h"
#include "genericvector.h"
class BLOB_CHOICE_IT;
class WERD_RES;
class STRING;
namespace tesseract {
class Tesseract;
// Iterator over recognition results that visits words and symbols in proper
// reading order, including for bidirectional (e.g. mixed Hebrew and English)
// text. It extends LTRResultIterator, which visits words strictly
// left-to-right.
class ResultIterator : public LTRResultIterator {
public:
// Returns a newly allocated ResultIterator built from resit.
// The caller is responsible for deleting the result.
static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
// ResultIterator is copy constructible!
// The default copy constructor works just fine for us.
virtual ~ResultIterator() {}
// ============= Moving around within the page ============.
// Moves the iterator to point to the start of the page to begin an iteration.
virtual void Begin();
// Moves to the start of the next object at the given level in the
// page hierarchy in the appropriate reading order and returns false if
// the end of the page was reached.
// NOTE that RIL_SYMBOL will skip non-text blocks, but all other
// PageIteratorLevel level values will visit each non-text block once.
// Think of non text blocks as containing a single para, with a single line,
// with a single imaginary word.
// Calls to Next with different levels may be freely intermixed.
// This function iterates words in right-to-left scripts correctly, if
// the appropriate language has been loaded into Tesseract.
virtual bool Next(PageIteratorLevel level);
// IsAtBeginningOf() returns whether we're at the logical beginning of the
// given level. (as opposed to PageIterator's left-to-right top-to-bottom
// order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
// For a full description, see pageiterator.h
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
// Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
// For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
// point at the last word in a paragraph. See PageIterator for full comment.
virtual bool IsAtFinalElement(PageIteratorLevel level,
PageIteratorLevel element) const;
// ============= Accessing data ==============.
// Returns the null terminated UTF-8 encoded text string for the current
// object at the given level. Use delete [] to free after use.
virtual char* GetUTF8Text(PageIteratorLevel level) const;
// Return whether the current paragraph's dominant reading direction
// is left-to-right (as opposed to right-to-left).
bool ParagraphIsLtr() const;
// ============= Exposed only for testing =============.
// Yields the reading order as a sequence of indices and (optional)
// meta-marks for a set of words (given left-to-right).
// The meta marks are passed as negative values:
// kMinorRunStart Start of minor direction text.
// kMinorRunEnd End of minor direction text.
// kComplexWord The previous indexed word contains both left-to-right and
// right-to-left characters and was treated as neutral.
//
// For example, suppose we have five words in a text line,
// indexed [0,1,2,3,4] from the leftmost side of the text line.
// The following are all believable reading_orders:
//
// Left-to-Right (in ltr paragraph):
// { 0, 1, 2, 3, 4 }
// Left-to-Right (in rtl paragraph):
// { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
// Right-to-Left (in rtl paragraph):
// { 4, 3, 2, 1, 0 }
// Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
// { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
static void CalculateTextlineOrder(
bool paragraph_is_ltr,
const GenericVector<StrongScriptDirection> &word_dirs,
GenericVectorEqEq<int> *reading_order);
// Negative meta-mark values used in reading orders (see above).
static const int kMinorRunStart;
static const int kMinorRunEnd;
static const int kComplexWord;
protected:
// We presume the data associated with the given iterator will outlive us.
// NB: This is protected because it does something that is non-obvious:
// it does not stay wherever resit might have pointed.
// NOTE(review): this comment originally said it resets to the beginning of
// the paragraph; the constructor as implemented moves to the logical start
// of the current text line -- confirm which is intended.
explicit ResultIterator(const LTRResultIterator &resit);
private:
// Calculates the current paragraph's dominant writing direction.
// Typically, members should use current_paragraph_is_ltr_ instead.
bool CurrentParagraphIsLtr() const;
// Returns word indices as measured from resit->RestartRow() = index 0
// for the reading order of words within a textline given an iterator
// into the middle of the text line.
// In addition to non-negative word indices, the following negative values
// may be inserted:
// kMinorRunStart Start of minor direction text.
// kMinorRunEnd End of minor direction text.
// kComplexWord The previous word contains both left-to-right and
// right-to-left characters and was treated as neutral.
void CalculateTextlineOrder(bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVectorEqEq<int> *indices) const;
// Same as above, but the caller's ssd gets filled in if ssd != NULL.
void CalculateTextlineOrder(bool paragraph_is_ltr,
const LTRResultIterator &resit,
GenericVector<StrongScriptDirection> *ssd,
GenericVectorEqEq<int> *indices) const;
// What is the index of the current word in a strict left-to-right reading
// of the row?
int LTRWordIndex() const;
// Given an iterator pointing at a word, returns the logical reading order
// of blob indices for the word.
void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
// Precondition: current_paragraph_is_ltr_ is set.
void MoveToLogicalStartOfTextline();
// Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set.
void MoveToLogicalStartOfWord();
// Are we pointing at the final (reading order) symbol of the word?
bool IsAtFinalSymbolOfWord() const;
// Are we pointing at the first (reading order) symbol of the word?
bool IsAtFirstSymbolOfWord() const;
// Append any extra marks that should be appended to this word when printed.
// Mostly, these are Unicode BiDi control characters.
void AppendSuffixMarks(STRING *text) const;
// Appends the current word in reading order to the given buffer.
void AppendUTF8WordText(STRING *text) const;
// Appends the text of the current text line, *assuming this iterator is
// positioned at the beginning of the text line* This function
// updates the iterator to point to the first position past the text line.
// Each textline is terminated in a single newline character.
// If the textline ends a paragraph, it gets a second terminal newline.
void IterateAndAppendUTF8TextlineText(STRING *text);
// Appends the text of the current paragraph in reading order
// to the given buffer.
// Each textline is terminated in a single newline character, and the
// paragraph gets an extra newline at the end.
void AppendUTF8ParagraphText(STRING *text) const;
// Returns whether the bidi_debug flag is set to at least min_level.
bool BidiDebug(int min_level) const;
// Cached result of CurrentParagraphIsLtr() for the current paragraph.
bool current_paragraph_is_ltr_;
// Is the currently pointed-at character at the beginning of
// a minor-direction run?
bool at_beginning_of_minor_run_;
// Is the currently pointed-at character in a minor-direction sequence?
bool in_minor_direction_;
};
} // namespace tesseract.
#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H__