mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
Moved ResultIterator/PageIterator to ccmain
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@645 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
8225f5b846
commit
ef786ad29b
@ -1,278 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: resultiterator.cpp
|
||||
// Description: Iterator for tesseract results that avoids using tesseract
|
||||
// internal data structures
|
||||
// Author: Ray Smith
|
||||
// Created: Fri Feb 26 14:32:09 PST 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "resultiterator.h"
|
||||
#include "allheaders.h"
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Constructs a ResultIterator by handing every argument, unchanged, to the
// PageIterator base class constructor. Nothing else is initialised here.
ResultIterator::ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
                               int scale, int scaled_yres,
                               int rect_left, int rect_top,
                               int rect_width, int rect_height)
    : PageIterator(page_res, tesseract, scale, scaled_yres,
                   rect_left, rect_top, rect_width, rect_height) {}
|
||||
|
||||
// Nothing to release here; the destructor body is intentionally empty.
ResultIterator::~ResultIterator() {}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// object at the given level. Use delete [] to free after use.
|
||||
char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
STRING text;
|
||||
PAGE_RES_IT res_it(*it_);
|
||||
WERD_CHOICE* best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
case RIL_PARA:
|
||||
do {
|
||||
best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
text += best_choice->unichar_string();
|
||||
text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
|
||||
res_it.forward();
|
||||
} while (res_it.block() == res_it.prev_block());
|
||||
break;
|
||||
case RIL_TEXTLINE:
|
||||
do {
|
||||
best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
text += best_choice->unichar_string();
|
||||
text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
|
||||
res_it.forward();
|
||||
} while (res_it.row() == res_it.prev_row());
|
||||
break;
|
||||
case RIL_WORD:
|
||||
text = best_choice->unichar_string();
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
text = tesseract_->unicharset.id_to_unichar(
|
||||
best_choice->unichar_id(blob_index_));
|
||||
}
|
||||
int length = text.length() + 1;
|
||||
char* result = new char[length];
|
||||
strncpy(result, text.string(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the mean confidence of the current object at the given level.
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float ResultIterator::Confidence(PageIteratorLevel level) const {
|
||||
if (it_->word() == NULL) return 0.0f; // Already at the end!
|
||||
float mean_certainty = 0.0f;
|
||||
int certainty_count = 0;
|
||||
PAGE_RES_IT res_it(*it_);
|
||||
WERD_CHOICE* best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
case RIL_PARA:
|
||||
do {
|
||||
best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
mean_certainty += best_choice->certainty();
|
||||
++certainty_count;
|
||||
res_it.forward();
|
||||
} while (res_it.block() == res_it.prev_block());
|
||||
break;
|
||||
case RIL_TEXTLINE:
|
||||
do {
|
||||
best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
mean_certainty += best_choice->certainty();
|
||||
++certainty_count;
|
||||
res_it.forward();
|
||||
} while (res_it.row() == res_it.prev_row());
|
||||
break;
|
||||
case RIL_WORD:
|
||||
mean_certainty += best_choice->certainty();
|
||||
++certainty_count;
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
|
||||
if (choices != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
|
||||
for (int blob = 0; blob < blob_index_; ++blob)
|
||||
blob_choices_it.forward();
|
||||
BLOB_CHOICE_IT choice_it(blob_choices_it.data());
|
||||
for (choice_it.mark_cycle_pt();
|
||||
!choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
if (choice_it.data()->unichar_id() ==
|
||||
best_choice->unichar_id(blob_index_))
|
||||
break;
|
||||
}
|
||||
mean_certainty += choice_it.data()->certainty();
|
||||
} else {
|
||||
mean_certainty += best_choice->certainty();
|
||||
}
|
||||
++certainty_count;
|
||||
}
|
||||
if (certainty_count > 0) {
|
||||
mean_certainty /= certainty_count;
|
||||
float confidence = 100 + 5 * mean_certainty;
|
||||
if (confidence < 0.0f) confidence = 0.0f;
|
||||
if (confidence > 100.0f) confidence = 100.0f;
|
||||
return confidence;
|
||||
}
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
// Returns the font attributes of the current word. If iterating at a higher
|
||||
// level object than words, eg textlines, then this will return the
|
||||
// attributes of the first word in that textline.
|
||||
// The actual return value is a string representing a font name. It points
|
||||
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
|
||||
// the iterator itself, ie rendered invalid by various members of
|
||||
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
|
||||
// Pointsize is returned in printers points (1/72 inch.)
|
||||
const char* ResultIterator::WordFontAttributes(bool* is_bold,
|
||||
bool* is_italic,
|
||||
bool* is_underlined,
|
||||
bool* is_monospace,
|
||||
bool* is_serif,
|
||||
bool* is_smallcaps,
|
||||
int* pointsize,
|
||||
int* font_id) const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
*font_id = it_->word()->fontinfo_id;
|
||||
if (*font_id < 0) return NULL; // No font available.
|
||||
const UnicityTable<FontInfo> &font_table = tesseract_->get_fontinfo_table();
|
||||
FontInfo font_info = font_table.get(*font_id);
|
||||
*is_bold = font_info.is_bold();
|
||||
*is_italic = font_info.is_italic();
|
||||
*is_underlined = false; // TODO(rays) fix this!
|
||||
*is_monospace = font_info.is_fixed_pitch();
|
||||
*is_serif = font_info.is_serif();
|
||||
*is_smallcaps = it_->word()->small_caps;
|
||||
// The font size is calculated from a multiple of the x-height
|
||||
// that came from the block.
|
||||
float row_height = it_->row()->row->x_height() *
|
||||
it_->block()->block->cell_over_xheight();
|
||||
// Convert from pixels to printers points.
|
||||
*pointsize = scaled_yres_ > 0
|
||||
? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
|
||||
: 0;
|
||||
|
||||
return font_info.name;
|
||||
}
|
||||
|
||||
// Returns true if the current word was found in a dictionary.
|
||||
bool ResultIterator::WordIsFromDictionary() const {
|
||||
if (it_->word() == NULL) return false; // Already at the end!
|
||||
int permuter = it_->word()->best_choice->permuter();
|
||||
return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
|
||||
permuter == USER_DAWG_PERM;
|
||||
}
|
||||
|
||||
// Returns true if the current word is numeric.
|
||||
bool ResultIterator::WordIsNumeric() const {
|
||||
if (it_->word() == NULL) return false; // Already at the end!
|
||||
int permuter = it_->word()->best_choice->permuter();
|
||||
return permuter == NUMBER_PERM;
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a superscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool ResultIterator::SymbolIsSuperscript() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a subscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool ResultIterator::SymbolIsSubscript() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a dropcap.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool ResultIterator::SymbolIsDropcap() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Sets up iteration over the blob choices of the symbol that result_it is
// positioned on. result_it must have a current word (asserted).
// If the word's best choice has no blob choices, choice_it_ is left NULL.
ChoiceIterator::ChoiceIterator(const ResultIterator& result_it) {
  ASSERT_HOST(result_it.it_->word() != NULL);
  tesseract_ = result_it.tesseract_;
  PAGE_RES_IT word_it(*result_it.it_);
  WERD_CHOICE* word_choice = word_it.word()->best_choice;
  BLOB_CHOICE_LIST_CLIST* blob_lists = word_choice->blob_choices();
  if (blob_lists == NULL) {
    choice_it_ = NULL;
    return;
  }
  // Step to the choice list of the symbol the ResultIterator is on.
  BLOB_CHOICE_LIST_C_IT list_it(blob_lists);
  for (int skipped = 0; skipped < result_it.blob_index_; ++skipped)
    list_it.forward();
  choice_it_ = new BLOB_CHOICE_IT(list_it.data());
  // Remember the start so that Next() can detect a complete cycle.
  choice_it_->mark_cycle_pt();
}
|
||||
|
||||
// Releases the BLOB_CHOICE_IT allocated by the constructor (may be NULL).
ChoiceIterator::~ChoiceIterator() {
  delete choice_it_;
}
|
||||
|
||||
// Moves to the next choice for the symbol and returns false if there
|
||||
// are none left.
|
||||
bool ChoiceIterator::Next() {
|
||||
if (choice_it_ == NULL)
|
||||
return false;
|
||||
choice_it_->forward();
|
||||
return !choice_it_->cycled_list();
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// choice. Use delete [] to free after use.
|
||||
const char* ChoiceIterator::GetUTF8Text() const {
|
||||
if (choice_it_ == NULL)
|
||||
return NULL;
|
||||
UNICHAR_ID id = choice_it_->data()->unichar_id();
|
||||
if (id < 0 || id >= tesseract_->unicharset.size() ||
|
||||
id == INVALID_UNICHAR_ID)
|
||||
return NULL;
|
||||
return tesseract_->unicharset.id_to_unichar(id);
|
||||
}
|
||||
|
||||
// Returns the confidence of the current choice.
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float ChoiceIterator::Confidence() const {
|
||||
if (choice_it_ == NULL)
|
||||
return 0.0f;
|
||||
float confidence = 100 + 5 * choice_it_->data()->certainty();
|
||||
if (confidence < 0.0f) confidence = 0.0f;
|
||||
if (confidence > 100.0f) confidence = 100.0f;
|
||||
return confidence;
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract.
|
@ -1,160 +0,0 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: resultiterator.h
|
||||
// Description: Iterator for tesseract results that avoids using tesseract
|
||||
// internal data structures.
|
||||
// Author: Ray Smith
|
||||
// Created: Fri Feb 26 11:01:06 PST 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_API_RESULTITERATOR_H__
|
||||
#define TESSERACT_API_RESULTITERATOR_H__
|
||||
|
||||
#include "pageiterator.h"
|
||||
|
||||
class BLOB_CHOICE_IT;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class Tesseract;
|
||||
|
||||
// Class to iterate over tesseract results, providing access to all levels
|
||||
// of the page hierarchy, without including any tesseract headers or having
|
||||
// to handle any tesseract structures.
|
||||
// WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
// therefore can only be used while the TessBaseAPI class still exists and
|
||||
// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
|
||||
// DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
// See apitypes.h for the definition of PageIteratorLevel.
|
||||
// See also base class PageIterator, which contains the bulk of the interface.
|
||||
// ResultIterator adds text-specific methods for access to OCR output.
|
||||
|
||||
class ResultIterator : public PageIterator {
|
||||
friend class ChoiceIterator;
|
||||
public:
|
||||
// page_res and tesseract come directly from the BaseAPI.
|
||||
// The rectangle parameters are copied indirectly from the Thresholder,
|
||||
// via the BaseAPI. They represent the coordinates of some rectangle in an
|
||||
// original image (in top-left-origin coordinates) and therefore the top-left
|
||||
// needs to be added to any output boxes in order to specify coordinates
|
||||
// in the original image. See TessBaseAPI::SetRectangle.
|
||||
// The scale and scaled_yres are in case the Thresholder scaled the image
|
||||
// rectangle prior to thresholding. Any coordinates in tesseract's image
|
||||
// must be divided by scale before adding (rect_left, rect_top).
|
||||
// The scaled_yres indicates the effective resolution of the binary image
|
||||
// that tesseract has been given by the Thresholder.
|
||||
// After the constructor, Begin has already been called.
|
||||
ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
|
||||
int scale, int scaled_yres,
|
||||
int rect_left, int rect_top,
|
||||
int rect_width, int rect_height);
|
||||
virtual ~ResultIterator();
|
||||
|
||||
// ResultIterators may be copied! This makes it possible to iterate over
|
||||
// all the objects at a lower level, while maintaining an iterator to
|
||||
// objects at a higher level. These constructors DO NOT CALL Begin, so
|
||||
// iterations will continue from the location of src.
|
||||
// TODO: For now the copy constructor and operator= only need the base class
|
||||
// versions, but if new data members are added, don't forget to add them!
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
|
||||
// See PageIterator.
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// object at the given level. Use delete [] to free after use.
|
||||
char* GetUTF8Text(PageIteratorLevel level) const;
|
||||
|
||||
// Returns the mean confidence of the current object at the given level.
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float Confidence(PageIteratorLevel level) const;
|
||||
|
||||
// ============= Functions that refer to words only ============.
|
||||
|
||||
// Returns the font attributes of the current word. If iterating at a higher
|
||||
// level object than words, eg textlines, then this will return the
|
||||
// attributes of the first word in that textline.
|
||||
// The actual return value is a string representing a font name. It points
|
||||
// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
|
||||
// the iterator itself, ie rendered invalid by various members of
|
||||
// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
|
||||
// Pointsize is returned in printers points (1/72 inch.)
|
||||
const char* WordFontAttributes(bool* is_bold,
|
||||
bool* is_italic,
|
||||
bool* is_underlined,
|
||||
bool* is_monospace,
|
||||
bool* is_serif,
|
||||
bool* is_smallcaps,
|
||||
int* pointsize,
|
||||
int* font_id) const;
|
||||
|
||||
// Returns true if the current word was found in a dictionary.
|
||||
bool WordIsFromDictionary() const;
|
||||
|
||||
// Returns true if the current word is numeric.
|
||||
bool WordIsNumeric() const;
|
||||
|
||||
// ============= Functions that refer to symbols only ============.
|
||||
|
||||
// Returns true if the current symbol is a superscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool SymbolIsSuperscript() const;
|
||||
// Returns true if the current symbol is a subscript.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool SymbolIsSubscript() const;
|
||||
// Returns true if the current symbol is a dropcap.
|
||||
// If iterating at a higher level object than symbols, eg words, then
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool SymbolIsDropcap() const;
|
||||
};
|
||||
|
||||
// Class to iterate over the classifier choices for a single RIL_SYMBOL.
|
||||
class ChoiceIterator {
|
||||
public:
|
||||
// Construction is from a ResultIterator that points to the symbol of
|
||||
// interest. The ChoiceIterator allows a one-shot iteration over the
|
||||
// choices for this symbol and after that is is useless.
|
||||
explicit ChoiceIterator(const ResultIterator& result_it);
|
||||
~ChoiceIterator();
|
||||
|
||||
// Moves to the next choice for the symbol and returns false if there
|
||||
// are none left.
|
||||
bool Next();
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// choice.
|
||||
// NOTE: Unlike ResultIterator::GetUTF8Text, the return points to an
|
||||
// internal structure and should NOT be delete[]ed to free after use.
|
||||
const char* GetUTF8Text() const;
|
||||
|
||||
// Returns the confidence of the current choice.
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float Confidence() const;
|
||||
|
||||
private:
|
||||
// Pointer to the Tesseract object owned by the API.
|
||||
Tesseract* tesseract_;
|
||||
// Iterator over the blob choices.
|
||||
BLOB_CHOICE_IT* choice_it_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_API_RESULTITERATOR_H__
|
@ -36,7 +36,7 @@ PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
|
||||
rect_left_(rect_left), rect_top_(rect_top),
|
||||
rect_width_(rect_width), rect_height_(rect_height) {
|
||||
it_ = new PAGE_RES_IT(page_res);
|
||||
Begin();
|
||||
PageIterator::Begin();
|
||||
}
|
||||
|
||||
PageIterator::~PageIterator() {
|
||||
@ -73,6 +73,11 @@ const PageIterator& PageIterator::operator=(const PageIterator& src) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
|
||||
return (it_ == NULL && it_ == other) ||
|
||||
((other != NULL) && (it_ != NULL) && (*it_ == *other));
|
||||
}
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
|
||||
// Resets the iterator to point to the start of the page.
|
||||
@ -81,12 +86,38 @@ void PageIterator::Begin() {
|
||||
BeginWord(0);
|
||||
}
|
||||
|
||||
void PageIterator::RestartParagraph() {
|
||||
if (it_->block() == NULL) return; // At end of the document.
|
||||
PAGE_RES_IT para(page_res_);
|
||||
PAGE_RES_IT next_para(para);
|
||||
next_para.forward_paragraph();
|
||||
while (next_para.cmp(*it_) <= 0) {
|
||||
para = next_para;
|
||||
next_para.forward_paragraph();
|
||||
}
|
||||
*it_ = para;
|
||||
BeginWord(0);
|
||||
}
|
||||
|
||||
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
|
||||
PageIterator p_start(*this);
|
||||
p_start.RestartParagraph();
|
||||
return p_start.it_->row() == it_->row();
|
||||
}
|
||||
|
||||
// Rewinds the underlying PAGE_RES_IT with restart_row() and then resets the
// within-word state by calling BeginWord(0).
void PageIterator::RestartRow() {
  it_->restart_row();
  BeginWord(0);
}
|
||||
|
||||
// Moves to the start of the next object at the given level in the
|
||||
// page hierarchy, and returns false if the end of the page was reached.
|
||||
// NOTE that RIL_SYMBOL will skip non-text blocks, but all other
|
||||
// PageIteratorLevel level values will visit each non-text block once.
|
||||
// Think of non text blocks as containing a single para, with a single line,
|
||||
// with a single imaginary word.
|
||||
// NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
|
||||
// non-text block at least once.
|
||||
// Think of non text blocks as containing a single para, with at least one
|
||||
// line, with a single imaginary word, containing a single symbol.
|
||||
// The bounding boxes mark out any polygonal nature of the block, and
|
||||
// PTIsTextType(BlockType()) is false for non-text blocks.
|
||||
// Calls to Next with different levels may be freely intermixed.
|
||||
// This function iterates words in right-to-left scripts correctly, if
|
||||
// the appropriate language has been loaded into Tesseract.
|
||||
@ -97,9 +128,11 @@ bool PageIterator::Next(PageIteratorLevel level) {
|
||||
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
case RIL_PARA:
|
||||
it_->forward_block();
|
||||
break;
|
||||
case RIL_PARA:
|
||||
it_->forward_paragraph();
|
||||
break;
|
||||
case RIL_TEXTLINE:
|
||||
for (it_->forward_with_empties(); it_->row() == it_->prev_row();
|
||||
it_->forward_with_empties());
|
||||
@ -112,7 +145,7 @@ bool PageIterator::Next(PageIteratorLevel level) {
|
||||
cblob_it_->forward();
|
||||
++blob_index_;
|
||||
if (blob_index_ >= word_length_)
|
||||
it_->forward();
|
||||
it_->forward_with_empties();
|
||||
else
|
||||
return true;
|
||||
break;
|
||||
@ -129,10 +162,13 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
|
||||
if (it_->word() == NULL) return true; // In an image block.
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
return blob_index_ == 0 && it_->block() != it_->prev_block();
|
||||
case RIL_PARA:
|
||||
return it_->block() != it_->prev_block();
|
||||
return blob_index_ == 0 &&
|
||||
(it_->block() != it_->prev_block() ||
|
||||
it_->row()->row->para() != it_->prev_row()->row->para());
|
||||
case RIL_TEXTLINE:
|
||||
return it_->row() != it_->prev_row();
|
||||
return blob_index_ == 0 && it_->row() != it_->prev_row();
|
||||
case RIL_WORD:
|
||||
return blob_index_ == 0;
|
||||
case RIL_SYMBOL:
|
||||
@ -145,7 +181,7 @@ bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
|
||||
// given level. (e.g. the last word in a line, the last line in a block)
|
||||
bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
|
||||
PageIteratorLevel element) const {
|
||||
if (it_->word() == NULL) return true; // Already at the end!
|
||||
if (Empty(element)) return true; // Already at the end!
|
||||
// The result is true if we step forward by element and find we are
|
||||
// at the end of the page or at beginning of *all* levels in:
|
||||
// [level, element).
|
||||
@ -154,7 +190,7 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
|
||||
// word on a line, so we also have to be at the first symbol in a word.
|
||||
PageIterator next(*this);
|
||||
next.Next(element);
|
||||
if (next.it_->word() == NULL) return true; // Reached the end of the page.
|
||||
if (next.Empty(element)) return true; // Reached the end of the page.
|
||||
while (element > level) {
|
||||
element = static_cast<PageIteratorLevel>(element - 1);
|
||||
if (!next.IsAtBeginningOf(element))
|
||||
@ -163,6 +199,21 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns whether this iterator is positioned
|
||||
// before other: -1
|
||||
// equal to other: 0
|
||||
// after other: 1
|
||||
int PageIterator::Cmp(const PageIterator &other) const {
|
||||
int word_cmp = it_->cmp(*other.it_);
|
||||
if (word_cmp != 0)
|
||||
return word_cmp;
|
||||
if (blob_index_ < other.blob_index_)
|
||||
return -1;
|
||||
if (blob_index_ == other.blob_index_)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
// Coordinate system:
|
||||
// Integer coordinates are at the cracks between the pixels.
|
||||
@ -176,22 +227,25 @@ bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
|
||||
// If an image rectangle has been set in the API, then returned coordinates
|
||||
// relate to the original (full) image, rather than the rectangle.
|
||||
|
||||
// Returns the bounding rectangle of the current object at the given level.
|
||||
// Returns the bounding rectangle of the current object at the given level in
|
||||
// the coordinates of the working image that is pix_binary().
|
||||
// See comment on coordinate system above.
|
||||
// Returns false if there is no such object at the current position.
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
if (it_->block() == NULL) return false; // Already at the end!
|
||||
if (it_->word() == NULL && level != RIL_BLOCK) return false;
|
||||
if (level == RIL_SYMBOL && blob_index_ >= word_length_)
|
||||
return false; // Zero length word, or already at the end of it.
|
||||
bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
if (Empty(level))
|
||||
return false;
|
||||
TBOX box;
|
||||
PARA *para = NULL;
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
case RIL_PARA:
|
||||
box = it_->block()->block->bounding_box();
|
||||
break;
|
||||
case RIL_PARA:
|
||||
para = it_->row()->row->para();
|
||||
if (para == NULL) return false;
|
||||
// explicit fall-through.
|
||||
case RIL_TEXTLINE:
|
||||
box = it_->row()->row->bounding_box();
|
||||
break;
|
||||
@ -204,22 +258,59 @@ bool PageIterator::BoundingBox(PageIteratorLevel level,
|
||||
else
|
||||
box = cblob_it_->data()->bounding_box();
|
||||
}
|
||||
if (level == RIL_PARA) {
|
||||
PageIterator other = *this;
|
||||
other.Begin();
|
||||
do {
|
||||
if (other.it_->row() && other.it_->row()->row &&
|
||||
other.it_->row()->row->para() == para) {
|
||||
box = box.bounding_union(other.it_->row()->row->bounding_box());
|
||||
}
|
||||
} while (other.Next(RIL_TEXTLINE));
|
||||
}
|
||||
if (level != RIL_SYMBOL || cblob_it_ != NULL)
|
||||
box.rotate(it_->block()->block->re_rotation());
|
||||
// Now we have a box in tesseract coordinates relative to the image rectangle,
|
||||
// we have to convert the coords to global page coords in a top-down system.
|
||||
*left = ClipToRange(box.left() / scale_ + rect_left_,
|
||||
// we have to convert the coords to a top-down system.
|
||||
const int pix_height = pixGetHeight(tesseract_->pix_binary());
|
||||
const int pix_width = pixGetWidth(tesseract_->pix_binary());
|
||||
*left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
|
||||
*top = ClipToRange(pix_height - box.top(), 0, pix_height);
|
||||
*right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
|
||||
*bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the bounding rectangle of the current object at the given level in
|
||||
// coordinates of the original image.
|
||||
// See comment on coordinate system above.
|
||||
// Returns false if there is no such object at the current position.
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
if (!BoundingBoxInternal(level, left, top, right, bottom))
|
||||
return false;
|
||||
// Convert to the coordinate system of the original image.
|
||||
*left = ClipToRange(*left / scale_ + rect_left_,
|
||||
rect_left_, rect_left_ + rect_width_);
|
||||
*top = ClipToRange((rect_height_ - box.top()) / scale_ + rect_top_,
|
||||
*top = ClipToRange(*top / scale_ + rect_top_,
|
||||
rect_top_, rect_top_ + rect_height_);
|
||||
*right = ClipToRange((box.right() + scale_ - 1) / scale_ + rect_left_,
|
||||
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_,
|
||||
*left, rect_left_ + rect_width_);
|
||||
*bottom = ClipToRange((rect_height_ - box.bottom() + scale_ - 1) / scale_
|
||||
+ rect_top_,
|
||||
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_,
|
||||
*top, rect_top_ + rect_height_);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Return that there is no such object at a given level.
|
||||
bool PageIterator::Empty(PageIteratorLevel level) const {
|
||||
if (it_->block() == NULL) return true; // Already at the end!
|
||||
if (it_->word() == NULL && level != RIL_BLOCK) return true; // image block
|
||||
if (level == RIL_SYMBOL && blob_index_ >= word_length_)
|
||||
return true; // Zero length word, or already at the end of it.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns the type of the current block. See apitypes.h for PolyBlockType.
|
||||
PolyBlockType PageIterator::BlockType() const {
|
||||
if (it_->block() == NULL || it_->block()->block == NULL)
|
||||
@ -230,7 +321,8 @@ PolyBlockType PageIterator::BlockType() const {
|
||||
}
|
||||
|
||||
// Returns a binary image of the current object at the given level.
|
||||
// The position and size match the return from BoundingBox.
|
||||
// The position and size match the return from BoundingBoxInternal, and so this
|
||||
// could be upscaled with respect to the original input image.
|
||||
// Use pixDestroy to delete the image after use.
|
||||
// The following methods are used to generate the images:
|
||||
// RIL_BLOCK: mask the page image with the block polygon.
|
||||
@ -250,22 +342,23 @@ PolyBlockType PageIterator::BlockType() const {
|
||||
// components.
|
||||
Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
|
||||
int left, top, right, bottom;
|
||||
if (!BoundingBox(level, &left, &top, &right, &bottom))
|
||||
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
|
||||
return NULL;
|
||||
Pix* pix = NULL;
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
case RIL_PARA:
|
||||
pix = it_->block()->block->render_mask();
|
||||
// AND the mask and the image.
|
||||
pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
|
||||
PIX_SRC & PIX_DST, tesseract_->pix_binary(),
|
||||
left, top);
|
||||
break;
|
||||
case RIL_PARA:
|
||||
case RIL_TEXTLINE:
|
||||
case RIL_WORD:
|
||||
case RIL_SYMBOL:
|
||||
if (level == RIL_SYMBOL && cblob_it_ != NULL)
|
||||
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
|
||||
cblob_it_->data()->area() != 0)
|
||||
return cblob_it_->data()->render();
|
||||
// Just clip from the bounding box.
|
||||
Box* box = boxCreate(left, top, right - left, bottom - top);
|
||||
@ -301,7 +394,7 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
|
||||
Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
|
||||
Pix* grey_pix = pixClipRectangle(pix, box, NULL);
|
||||
boxDestroy(&box);
|
||||
if (level == RIL_BLOCK || level == RIL_PARA) {
|
||||
if (level == RIL_BLOCK) {
|
||||
Pix* mask = it_->block()->block->render_mask();
|
||||
Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
|
||||
pixRasterop(expanded_mask, padding, padding,
|
||||
@ -316,7 +409,6 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
|
||||
return grey_pix;
|
||||
}
|
||||
|
||||
|
||||
// Returns the baseline of the current object at the given level.
|
||||
// The baseline is the line that passes through (x1, y1) and (x2, y2).
|
||||
// WARNING: with vertical text, baselines may be vertical!
|
||||
@ -345,7 +437,7 @@ bool PageIterator::Baseline(PageIteratorLevel level,
|
||||
void PageIterator::Orientation(tesseract::Orientation *orientation,
|
||||
tesseract::WritingDirection *writing_direction,
|
||||
tesseract::TextlineOrder *textline_order,
|
||||
float *deskew_angle) {
|
||||
float *deskew_angle) const {
|
||||
BLOCK* block = it_->block()->block;
|
||||
|
||||
// Orientation
|
||||
@ -388,6 +480,22 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
|
||||
*deskew_angle = -skew.angle();
|
||||
}
|
||||
|
||||
// Reports paragraph-level information for the row the iterator is on.
//   just              - always written; set to JUSTIFICATION_UNKNOWN.
//   is_list_item      - copied from the paragraph's is_list_item flag.
//   is_crown          - copied from is_very_first_or_continuation.
//   first_line_indent - model first_indent() minus model body_indent().
// If the current row, its ROW, its paragraph or the paragraph's model is
// missing, the function returns after writing only *just; the other three
// outputs are left untouched, so callers should pre-initialize them.
// NOTE(review): *just is never updated from the paragraph model in this
// function, so it is always JUSTIFICATION_UNKNOWN on return -- confirm
// whether the model's justification was meant to be reported here.
void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
|
||||
bool *is_list_item,
|
||||
bool *is_crown,
|
||||
int *first_line_indent) const {
|
||||
*just = tesseract::JUSTIFICATION_UNKNOWN;
|
||||
// Bail out unless the whole row -> ROW -> PARA -> model chain is present.
if (!it_->row() || !it_->row()->row || !it_->row()->row->para() ||
|
||||
!it_->row()->row->para()->model)
|
||||
return;
|
||||
|
||||
PARA *para = it_->row()->row->para();
|
||||
*is_list_item = para->is_list_item;
|
||||
*is_crown = para->is_very_first_or_continuation;
|
||||
// Indent of the first line relative to the body lines; may be negative.
*first_line_indent = para->model->first_indent() -
|
||||
para->model->body_indent();
|
||||
}
|
||||
|
||||
// Sets up the internal data for iterating the blobs of a new word, then
|
||||
// moves the iterator to the given offset.
|
||||
void PageIterator::BeginWord(int offset) {
|
||||
@ -404,6 +512,12 @@ void PageIterator::BeginWord(int offset) {
|
||||
// is already baseline denormalized.
|
||||
word_length_ = word_res->best_choice->length();
|
||||
ASSERT_HOST(word_res->box_word != NULL);
|
||||
if (word_res->box_word->length() != word_length_) {
|
||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
|
||||
word_length_, word_res->best_choice->unichar_string().string(),
|
||||
word_res->box_word->length());
|
||||
word_res->box_word->bounding_box().print();
|
||||
}
|
||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
||||
word_ = NULL;
|
||||
// We will be iterating the box_word.
|
@ -18,10 +18,10 @@
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_API_PAGEITERATOR_H__
|
||||
#define TESSERACT_API_PAGEITERATOR_H__
|
||||
#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
|
||||
#define TESSERACT_CCMAIN_PAGEITERATOR_H__
|
||||
|
||||
#include "apitypes.h"
|
||||
#include "publictypes.h"
|
||||
|
||||
class C_BLOB_IT;
|
||||
class PBLOB_IT;
|
||||
@ -72,10 +72,27 @@ class PageIterator {
|
||||
PageIterator(const PageIterator& src);
|
||||
const PageIterator& operator=(const PageIterator& src);
|
||||
|
||||
// Are we positioned at the same location as other?
|
||||
bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
|
||||
// Moves the iterator to point to the start of the page to begin an iteration.
|
||||
void Begin();
|
||||
virtual void Begin();
|
||||
|
||||
// Moves the iterator to the beginning of the paragraph.
|
||||
// This class implements this functionality by moving it to the zero indexed
|
||||
// blob of the first (leftmost) word on the first row of the paragraph.
|
||||
virtual void RestartParagraph();
|
||||
|
||||
// Return whether this iterator points anywhere in the first textline of a
|
||||
// paragraph.
|
||||
bool IsWithinFirstTextlineOfParagraph() const;
|
||||
|
||||
// Moves the iterator to the beginning of the text line.
|
||||
// This class implements this functionality by moving it to the zero indexed
|
||||
// blob of the first (leftmost) word of the row.
|
||||
virtual void RestartRow();
|
||||
|
||||
// Moves to the start of the next object at the given level in the
|
||||
// page hierarchy, and returns false if the end of the page was reached.
|
||||
@ -86,17 +103,43 @@ class PageIterator {
|
||||
// Calls to Next with different levels may be freely intermixed.
|
||||
// This function iterates words in right-to-left scripts correctly, if
|
||||
// the appropriate language has been loaded into Tesseract.
|
||||
bool Next(PageIteratorLevel level);
|
||||
virtual bool Next(PageIteratorLevel level);
|
||||
|
||||
// Returns true if the iterator is at the start of an object at the given
|
||||
// level. Possible uses include determining if a call to Next(RIL_WORD)
|
||||
// moved to the start of a RIL_PARA.
|
||||
bool IsAtBeginningOf(PageIteratorLevel level) const;
|
||||
// level.
|
||||
//
|
||||
// For instance, suppose an iterator it is pointed to the first symbol of the
|
||||
// first word of the third line of the second paragraph of the first block in
|
||||
// a page, then:
|
||||
// it.IsAtBeginningOf(RIL_BLOCK) = false
|
||||
// it.IsAtBeginningOf(RIL_PARA) = false
|
||||
// it.IsAtBeginningOf(RIL_TEXTLINE) = true
|
||||
// it.IsAtBeginningOf(RIL_WORD) = true
|
||||
// it.IsAtBeginningOf(RIL_SYMBOL) = true
|
||||
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
|
||||
|
||||
// Returns whether the iterator is positioned at the last element in a
|
||||
// given level. (e.g. the last word in a line, the last line in a block)
|
||||
bool IsAtFinalElement(PageIteratorLevel level,
|
||||
PageIteratorLevel element) const;
|
||||
//
|
||||
// Here's some two-paragraph example
|
||||
// text. It starts off innocuously
|
||||
// enough but quickly turns bizarre.
|
||||
// The author inserts a cornucopia
|
||||
// of words to guard against confused
|
||||
// references.
|
||||
//
|
||||
// Now take an iterator it pointed to the start of "bizarre."
|
||||
// it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false
|
||||
// it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true
|
||||
// it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false
|
||||
virtual bool IsAtFinalElement(PageIteratorLevel level,
|
||||
PageIteratorLevel element) const;
|
||||
|
||||
// Returns whether this iterator is positioned
|
||||
// before other: -1
|
||||
// equal to other: 0
|
||||
// after other: 1
|
||||
int Cmp(const PageIterator &other) const;
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
// Coordinate system:
|
||||
@ -120,12 +163,21 @@ class PageIterator {
|
||||
// the image to include more foreground pixels. See GetImage below.
|
||||
bool BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top, int* right, int* bottom) const;
|
||||
// Returns the bounding rectangle of the object in a coordinate system of the
|
||||
// working image rectangle having its origin at (rect_left_, rect_top_) with
|
||||
// respect to the original image and is scaled by a factor scale_.
|
||||
bool BoundingBoxInternal(PageIteratorLevel level,
|
||||
int* left, int* top, int* right, int* bottom) const;
|
||||
|
||||
// Returns whether there is no object of a given level.
|
||||
bool Empty(PageIteratorLevel level) const;
|
||||
|
||||
// Returns the type of the current block. See apitypes.h for PolyBlockType.
|
||||
PolyBlockType BlockType() const;
|
||||
|
||||
// Returns a binary image of the current object at the given level.
|
||||
// The position and size match the return from BoundingBox.
|
||||
// The position and size match the return from BoundingBoxInternal, and so
|
||||
// this could be upscaled with respect to the original input image.
|
||||
// Use pixDestroy to delete the image after use.
|
||||
Pix* GetBinaryImage(PageIteratorLevel level) const;
|
||||
|
||||
@ -156,7 +208,38 @@ class PageIterator {
|
||||
void Orientation(tesseract::Orientation *orientation,
|
||||
tesseract::WritingDirection *writing_direction,
|
||||
tesseract::TextlineOrder *textline_order,
|
||||
float *deskew_angle);
|
||||
float *deskew_angle) const;
|
||||
|
||||
// Returns information about the current paragraph, if available.
|
||||
//
|
||||
// justification -
|
||||
// LEFT if ragged right, or fully justified and script is left-to-right.
|
||||
// RIGHT if ragged left, or fully justified and script is right-to-left.
|
||||
// unknown if it looks like source code or we have very few lines.
|
||||
// is_list_item -
|
||||
// true if we believe this is a member of an ordered or unordered list.
|
||||
// is_crown -
|
||||
// true if the first line of the paragraph is aligned with the other
|
||||
// lines of the paragraph even though subsequent paragraphs have first
|
||||
// line indents. This typically indicates that this is the continuation
|
||||
// of a previous paragraph or that it is the very first paragraph in
|
||||
// the chapter.
|
||||
// first_line_indent -
|
||||
// For LEFT aligned paragraphs, the first text line of paragraphs of
|
||||
// this kind are indented this many pixels from the left edge of the
|
||||
// rest of the paragraph.
|
||||
// for RIGHT aligned paragraphs, the first text line of paragraphs of
|
||||
// this kind are indented this many pixels from the right edge of the
|
||||
// rest of the paragraph.
|
||||
// NOTE 1: This value may be negative.
|
||||
// NOTE 2: if *is_crown == true, the first line of this paragraph is
|
||||
// actually flush, and first_line_indent is set to the "common"
|
||||
// first_line_indent for subsequent paragraphs in this block
|
||||
// of text.
|
||||
void ParagraphInfo(tesseract::ParagraphJustification *justification,
|
||||
bool *is_list_item,
|
||||
bool *is_crown,
|
||||
int *first_line_indent) const;
|
||||
|
||||
protected:
|
||||
// Sets up the internal data for iterating the blobs of a new word, then
|
||||
@ -192,4 +275,4 @@ class PageIterator {
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_API_PAGEITERATOR_H__
|
||||
#endif // TESSERACT_CCMAIN_PAGEITERATOR_H__
|
663
ccmain/resultiterator.cpp
Normal file
663
ccmain/resultiterator.cpp
Normal file
@ -0,0 +1,663 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: resultiterator.cpp
|
||||
// Description: Iterator for tesseract results that is capable of
|
||||
// iterating in proper reading order over Bi Directional
|
||||
// (e.g. mixed Hebrew and English) text.
|
||||
// Author: David Eger
|
||||
// Created: Fri May 27 13:58:06 PST 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "resultiterator.h"
|
||||
|
||||
#include "allheaders.h"
|
||||
#include "pageres.h"
|
||||
#include "strngs.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "unicharset.h"
|
||||
#include "unicodes.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Builds a reading-order iterator from a strictly left-to-right iterator.
// The paragraph direction is determined for the current position, the
// minor-run state is cleared, and the iterator is then moved to the
// logical (reading order) start of the current text line.
ResultIterator::ResultIterator(const LTRResultIterator &resit)
    : LTRResultIterator(resit) {
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  at_beginning_of_minor_run_ = false;
  in_minor_direction_ = false;
  MoveToLogicalStartOfTextline();
}
|
||||
|
||||
// Factory: heap-allocates a ResultIterator constructed from resit.
// The caller owns the returned object and must delete it.
ResultIterator *ResultIterator::StartOfParagraph(
    const LTRResultIterator &resit) {
  ResultIterator *created = new ResultIterator(resit);
  return created;
}
|
||||
|
||||
bool ResultIterator::ParagraphIsLtr() const {
|
||||
return current_paragraph_is_ltr_;
|
||||
}
|
||||
|
||||
bool ResultIterator::CurrentParagraphIsLtr() const {
|
||||
if (!it_->word())
|
||||
return true; // doesn't matter.
|
||||
LTRResultIterator it(*this);
|
||||
it.RestartParagraph();
|
||||
// Try to figure out the ltr-ness of the paragraph. The rules below
|
||||
// make more sense in the context of a difficult paragraph example.
|
||||
// Here we denote {ltr characters, RTL CHARACTERS}:
|
||||
//
|
||||
// "don't go in there!" DAIS EH
|
||||
// EHT OTNI DEPMUJ FELSMIH NEHT DNA
|
||||
// .GNIDLIUB GNINRUB
|
||||
//
|
||||
// On the first line, the left-most word is LTR and the rightmost word
|
||||
// is RTL. Thus, we are better off taking the majority direction for
|
||||
// the whole paragraph contents. So instead of "the leftmost word is LTR"
|
||||
// indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
|
||||
// would not do: Typically an RTL paragraph would *not* start with an LTR
|
||||
// word. So our heuristics are as follows:
|
||||
//
|
||||
// (1) If the first text line has an RTL word in the left-most position
|
||||
// it is RTL.
|
||||
// (2) If the first text line has an LTR word in the right-most position
|
||||
// it is LTR.
|
||||
// (3) If neither of the above is true, take the majority count for the
|
||||
// paragraph -- if there are more rtl words, it is RTL. If there
|
||||
// are more LTR words, it's LTR.
|
||||
bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
|
||||
bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
|
||||
int num_ltr, num_rtl;
|
||||
num_rtl = leftmost_rtl ? 1 : 0;
|
||||
num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
|
||||
for (it.Next(RIL_WORD);
|
||||
!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
|
||||
it.Next(RIL_WORD)) {
|
||||
StrongScriptDirection dir = it.WordDirection();
|
||||
rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
|
||||
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
|
||||
num_ltr += rightmost_ltr ? 1 : 0;
|
||||
}
|
||||
if (leftmost_rtl)
|
||||
return false;
|
||||
if (rightmost_ltr)
|
||||
return true;
|
||||
// First line is ambiguous. Take statistics on the whole paragraph.
|
||||
if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
|
||||
StrongScriptDirection dir = it.WordDirection();
|
||||
num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
|
||||
num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
|
||||
} while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
|
||||
return num_ltr >= num_rtl;
|
||||
}
|
||||
|
||||
// Negative marker values that CalculateTextlineOrder interleaves with the
// (non-negative) word indices in its reading-order output.
// Pushed immediately before the words of a minor-direction run.
const int ResultIterator::kMinorRunStart = -1;
|
||||
// Pushed immediately after the words of a minor-direction run.
const int ResultIterator::kMinorRunEnd = -2;
|
||||
// Pushed right after a word whose direction is DIR_MIX.
const int ResultIterator::kComplexWord = -3;
|
||||
|
||||
// Fills blob_indices with the indices [0, word_length_) of the current
// word's blobs, arranged in reading order for the current context.
// blob_indices is cleared first and is left empty when there is no word.
// When the reading context is left-to-right, or the word reports that its
// unichars are already in reading order, the result is simply 0..n-1.
// Otherwise each symbol is classified as L or R (after folding European
// number sequences into L) and the word is emitted from right to left,
// keeping every L run in its internal left-to-right order.
void ResultIterator::CalculateBlobOrder(
|
||||
GenericVector<int> *blob_indices) const {
|
||||
// The effective reading direction flips inside a minor-direction run.
bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
|
||||
blob_indices->clear();
|
||||
if (Empty(RIL_WORD)) return;
|
||||
if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
|
||||
// Easy! just return the blobs in order;
|
||||
for (int i = 0; i < word_length_; i++)
|
||||
blob_indices->push_back(i);
|
||||
return;
|
||||
}
|
||||
|
||||
// The blobs are in left-to-right order, but the current reading context
|
||||
// is right-to-left.
|
||||
const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
|
||||
const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
|
||||
const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
|
||||
const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
|
||||
const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
|
||||
const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
|
||||
const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
|
||||
|
||||
// Step 1: Scan for and mark European Number sequences
|
||||
// [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
|
||||
GenericVector<int> letter_types;
|
||||
for (int i = 0; i < word_length_; i++) {
|
||||
letter_types.push_back(it_->word()->SymbolDirection(i));
|
||||
}
|
||||
// Convert a single separator sandwiched between two EN's into an EN.
|
||||
for (int i = 0; i + 2 < word_length_; i++) {
|
||||
if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
|
||||
(letter_types[i + 1] == U_EURO_NUM_SEP ||
|
||||
letter_types[i + 1] == U_COMMON_NUM_SEP)) {
|
||||
letter_types[i + 1] = U_EURO_NUM;
|
||||
}
|
||||
}
|
||||
// Scan for sequences of European Number Terminators around ENs and convert
|
||||
// them to ENs.
|
||||
for (int i = 0; i < word_length_; i++) {
|
||||
if (letter_types[i] == U_EURO_NUM_TERM) {
|
||||
// Look right: terminators directly preceding a number join it.
int j = i + 1;
|
||||
while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
|
||||
if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
|
||||
// The sequence [i..j) should be converted to all European Numbers.
|
||||
for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
|
||||
}
|
||||
// Look left: terminators directly following a number join it too.
j = i - 1;
|
||||
while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
|
||||
if (j > -1 && letter_types[j] == U_EURO_NUM) {
|
||||
// The sequence [j..i] should be converted to all European Numbers.
|
||||
for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Step 2: Convert all remaining types to either L or R.
|
||||
// Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
|
||||
// All other are R.
|
||||
for (int i = 0; i < word_length_;) {
|
||||
int ti = letter_types[i];
|
||||
if (ti == U_LTR || ti == U_EURO_NUM) {
|
||||
// Left to right sequence; scan to the end of it.
|
||||
// last_good tracks the last L/EN seen, so trailing separators or
// neutrals are not absorbed into the L run.
int last_good = i;
|
||||
for (int j = i + 1; j < word_length_; j++) {
|
||||
int tj = letter_types[j];
|
||||
if (tj == U_LTR || tj == U_EURO_NUM) {
|
||||
last_good = j;
|
||||
} else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
|
||||
// do nothing.
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// [i..last_good] is the L sequence
|
||||
for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
|
||||
i = last_good + 1;
|
||||
} else {
|
||||
letter_types[i] = U_RTL;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, letter_types is entirely U_LTR or U_RTL.
|
||||
// Emit from the right-most blob towards the left; R blobs one by one,
// L runs as a block in ascending index order.
for (int i = word_length_ - 1; i >= 0;) {
|
||||
if (letter_types[i] == U_RTL) {
|
||||
blob_indices->push_back(i);
|
||||
i--;
|
||||
} else {
|
||||
// left to right sequence. scan to the beginning.
|
||||
int j = i - 1;
|
||||
for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
|
||||
// Now (j, i] is LTR
|
||||
for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
// Every blob must have been emitted exactly once.
ASSERT_HOST(blob_indices->size() == word_length_);
|
||||
}
|
||||
|
||||
static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
|
||||
for (int i = 0; i < dirs.size(); i++) {
|
||||
switch (dirs[i]) {
|
||||
case DIR_NEUTRAL: tprintf ("N "); break;
|
||||
case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
|
||||
case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
|
||||
case DIR_MIX: tprintf("Z "); break;
|
||||
default: tprintf("? "); break;
|
||||
}
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
|
||||
void ResultIterator::CalculateTextlineOrder(
|
||||
bool paragraph_is_ltr,
|
||||
const LTRResultIterator &resit,
|
||||
GenericVectorEqEq<int> *word_indices) const {
|
||||
GenericVector<StrongScriptDirection> directions;
|
||||
CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
|
||||
}
|
||||
|
||||
void ResultIterator::CalculateTextlineOrder(
|
||||
bool paragraph_is_ltr,
|
||||
const LTRResultIterator &resit,
|
||||
GenericVector<StrongScriptDirection> *dirs_arg,
|
||||
GenericVectorEqEq<int> *word_indices) const {
|
||||
GenericVector<StrongScriptDirection> dirs;
|
||||
GenericVector<StrongScriptDirection> *directions;
|
||||
directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
|
||||
directions->truncate(0);
|
||||
|
||||
// A LTRResultIterator goes strictly left-to-right word order.
|
||||
LTRResultIterator ltr_it(resit);
|
||||
ltr_it.RestartRow();
|
||||
if (ltr_it.Empty(RIL_WORD)) return;
|
||||
do {
|
||||
directions->push_back(ltr_it.WordDirection());
|
||||
} while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
|
||||
|
||||
word_indices->truncate(0);
|
||||
CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
|
||||
}
|
||||
|
||||
// Computes the reading order of the words of one text line from their
// directions alone.
//   paragraph_is_ltr - major direction of the enclosing paragraph.
//   word_dirs        - direction of each word, in left-to-right order.
//   reading_order    - cleared, then filled with indices into word_dirs in
//                      reading order, interleaved with the negative markers
//                      kMinorRunStart / kMinorRunEnd (bracketing each run of
//                      minor-direction words) and kComplexWord (following a
//                      DIR_MIX word outside a minor run).
// Words are visited in the major direction; each minor-direction run is
// emitted in the opposite order so that it reads correctly.
void ResultIterator::CalculateTextlineOrder(
|
||||
bool paragraph_is_ltr,
|
||||
const GenericVector<StrongScriptDirection> &word_dirs,
|
||||
GenericVectorEqEq<int> *reading_order) {
|
||||
reading_order->truncate(0);
|
||||
if (word_dirs.size() == 0) return;
|
||||
|
||||
// Take all of the runs of minor direction words and insert them
|
||||
// in reverse order.
|
||||
// [start, end) is walked in steps of major_step; end is one past the last
// word in the major direction (size() for LTR, -1 for RTL).
int minor_direction, major_direction, major_step, start, end;
|
||||
if (paragraph_is_ltr) {
|
||||
start = 0;
|
||||
end = word_dirs.size();
|
||||
major_step = 1;
|
||||
major_direction = DIR_LEFT_TO_RIGHT;
|
||||
minor_direction = DIR_RIGHT_TO_LEFT;
|
||||
} else {
|
||||
start = word_dirs.size() - 1;
|
||||
end = -1;
|
||||
major_step = -1;
|
||||
major_direction = DIR_RIGHT_TO_LEFT;
|
||||
minor_direction = DIR_LEFT_TO_RIGHT;
|
||||
// Special rule: if there are neutral words at the right most side
|
||||
// of a line adjacent to a left-to-right word in the middle of the
|
||||
// line, we interpret the end of the line as a single LTR sequence.
|
||||
if (word_dirs[start] == DIR_NEUTRAL) {
|
||||
// Skip leftwards over the trailing neutral words.
int neutral_end = start;
|
||||
while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
|
||||
neutral_end--;
|
||||
}
|
||||
if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
|
||||
// LTR followed by neutrals.
|
||||
// Scan for the beginning of the minor left-to-right run.
|
||||
// left ends up at the left-most LTR word reachable without crossing
// an RTL word.
int left = neutral_end;
|
||||
for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
|
||||
if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
|
||||
}
|
||||
reading_order->push_back(kMinorRunStart);
|
||||
for (int i = left; i < word_dirs.size(); i++) {
|
||||
reading_order->push_back(i);
|
||||
if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
|
||||
}
|
||||
reading_order->push_back(kMinorRunEnd);
|
||||
// The main loop below resumes just left of the run already emitted.
start = left - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = start; i != end;) {
|
||||
if (word_dirs[i] == minor_direction) {
|
||||
// Find the far end of the minor run: advance to the next major word
// (or the line end), then back up to the last minor word before it.
int j = i;
|
||||
while (j != end && word_dirs[j] != major_direction)
|
||||
j += major_step;
|
||||
if (j == end) j -= major_step;
|
||||
while (j != i && word_dirs[j] != minor_direction)
|
||||
j -= major_step;
|
||||
// [j..i] is a minor direction run.
|
||||
reading_order->push_back(kMinorRunStart);
|
||||
// Emit the run from its far end back towards i.
for (int k = j; k != i; k -= major_step) {
|
||||
reading_order->push_back(k);
|
||||
}
|
||||
reading_order->push_back(i);
|
||||
reading_order->push_back(kMinorRunEnd);
|
||||
i = j + major_step;
|
||||
} else {
|
||||
reading_order->push_back(i);
|
||||
if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
|
||||
i += major_step;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int ResultIterator::LTRWordIndex() const {
|
||||
int this_word_index = 0;
|
||||
LTRResultIterator textline(*this);
|
||||
textline.RestartRow();
|
||||
while (!textline.PositionedAtSameWord(it_)) {
|
||||
this_word_index++;
|
||||
textline.Next(RIL_WORD);
|
||||
}
|
||||
return this_word_index;
|
||||
}
|
||||
|
||||
void ResultIterator::MoveToLogicalStartOfWord() {
|
||||
if (word_length_ == 0) {
|
||||
BeginWord(0);
|
||||
return;
|
||||
}
|
||||
GenericVector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
if (blob_order.size() == 0 || blob_order[0] == 0) return;
|
||||
BeginWord(blob_order[0]);
|
||||
}
|
||||
|
||||
bool ResultIterator::IsAtFinalSymbolOfWord() const {
|
||||
if (!it_->word()) return true;
|
||||
GenericVector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
return blob_order.size() == 0 || blob_order.back() == blob_index_;
|
||||
}
|
||||
|
||||
bool ResultIterator::IsAtFirstSymbolOfWord() const {
|
||||
if (!it_->word()) return true;
|
||||
GenericVector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
return blob_order.size() == 0 || blob_order[0] == blob_index_;
|
||||
}
|
||||
|
||||
// Appends to *text whatever Unicode direction mark (LRM/RLM) is needed after
// the current word, based on the markers CalculateTextlineOrder placed
// directly after this word in the text line's reading order:
//   - kComplexWord: append the mark of the current reading direction.
//   - kMinorRunEnd: the word closes a minor-direction run, so append the
//     mark of the *other* direction, i.e. the paragraph's own direction
//     (LRM for an LTR paragraph, RLM for an RTL one).
// Nothing is appended if there is no current word, the word cannot be found
// in the reading order, or no such marker follows it.
void ResultIterator::AppendSuffixMarks(STRING *text) const {
  if (!it_->word()) return;
  bool reading_direction_is_ltr =
      current_paragraph_is_ltr_ ^ in_minor_direction_;
  // Scan forward to see what meta-information the word ordering algorithm
  // left us.
  GenericVectorEqEq<int> textline_order;
  CalculateTextlineOrder(current_paragraph_is_ltr_,
                         *this, &textline_order);
  int this_word_index = LTRWordIndex();
  int i = textline_order.get_index(this_word_index);
  if (i < 0) return;

  // Markers are negative; remember the last one before the next real word.
  int last_non_word_mark = 0;
  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
    last_non_word_mark = textline_order[i];
  }
  if (last_non_word_mark == kComplexWord) {
    *text += reading_direction_is_ltr ? kLRM : kRLM;
  } else if (last_non_word_mark == kMinorRunEnd) {
    // Previously both branches appended the same RLM+LRM pair, which made
    // the paragraph-direction test meaningless. Return to the paragraph's
    // direction instead.
    *text += current_paragraph_is_ltr_ ? kLRM : kRLM;
  }
}
|
||||
|
||||
void ResultIterator::MoveToLogicalStartOfTextline() {
|
||||
GenericVectorEqEq<int> word_indices;
|
||||
RestartRow();
|
||||
CalculateTextlineOrder(current_paragraph_is_ltr_,
|
||||
dynamic_cast<const LTRResultIterator&>(*this),
|
||||
&word_indices);
|
||||
int i = 0;
|
||||
for (; i < word_indices.size() && word_indices[i] < 0; i++) {
|
||||
if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
|
||||
else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
|
||||
}
|
||||
if (in_minor_direction_) at_beginning_of_minor_run_ = true;
|
||||
if (i >= word_indices.size()) return;
|
||||
int first_word_index = word_indices[i];
|
||||
for (int j = 0; j < first_word_index; j++) {
|
||||
PageIterator::Next(RIL_WORD);
|
||||
}
|
||||
MoveToLogicalStartOfWord();
|
||||
}
|
||||
|
||||
// Rewinds to the start of the page, then re-derives the bidi state:
// the minor-run flags are cleared, the paragraph direction is recomputed,
// and the iterator moves to the logical start of the first text line.
void ResultIterator::Begin() {
  LTRResultIterator::Begin();
  at_beginning_of_minor_run_ = false;
  in_minor_direction_ = false;
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}
|
||||
|
||||
// Advances to the next object at the given level in reading order.
// Returns false if the iterator was already at the end or ran off the page.
//   RIL_BLOCK / RIL_PARA / RIL_TEXTLINE: defer to PageIterator::Next, then
//     recompute the paragraph direction if we are now in the first line of
//     a paragraph and move to the logical start of the new text line.
//   RIL_SYMBOL: move to the next blob of the word in reading order; when
//     the word is exhausted, fall through to RIL_WORD.
//   RIL_WORD: move to the next word of the line in reading order, updating
//     the minor-run state from the markers in between; when the line is
//     exhausted, continue with Next(RIL_TEXTLINE).
bool ResultIterator::Next(PageIteratorLevel level) {
|
||||
if (it_->block() == NULL) return false; // already at end!
|
||||
switch (level) {
|
||||
case RIL_BLOCK: // explicit fall-through
|
||||
case RIL_PARA: // explicit fall-through
|
||||
case RIL_TEXTLINE:
|
||||
if (!PageIterator::Next(level)) return false;
|
||||
if (IsWithinFirstTextlineOfParagraph()) {
|
||||
// if we've advanced to a new paragraph,
|
||||
// recalculate current_paragraph_is_ltr_
|
||||
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
|
||||
}
|
||||
// NOTE(review): at_beginning_of_minor_run_ is not cleared here, and
// MoveToLogicalStartOfTextline only ever sets it to true -- confirm a
// stale true value cannot leak onto the new line.
in_minor_direction_ = false;
|
||||
MoveToLogicalStartOfTextline();
|
||||
return it_->block() != NULL;
|
||||
case RIL_SYMBOL:
|
||||
{
|
||||
GenericVector<int> blob_order;
|
||||
CalculateBlobOrder(&blob_order);
|
||||
// Locate the current blob in the reading order, then step past it.
int next_blob = 0;
|
||||
while (next_blob < blob_order.size() &&
|
||||
blob_index_ != blob_order[next_blob])
|
||||
next_blob++;
|
||||
next_blob++;
|
||||
if (next_blob < blob_order.size()) {
|
||||
// we're in the same word; simply advance one blob.
|
||||
BeginWord(blob_order[next_blob]);
|
||||
at_beginning_of_minor_run_ = false;
|
||||
return true;
|
||||
}
|
||||
level = RIL_WORD; // we've fallen through to the next word.
|
||||
}
|
||||
case RIL_WORD: // explicit fall-through.
|
||||
{
|
||||
// No word here (e.g. a non-text block): skip to the next block.
if (it_->word() == NULL) return Next(RIL_BLOCK);
|
||||
GenericVectorEqEq<int> word_indices;
|
||||
int this_word_index = LTRWordIndex();
|
||||
CalculateTextlineOrder(current_paragraph_is_ltr_,
|
||||
*this,
|
||||
&word_indices);
|
||||
// Index of the last real (non-marker) entry in the reading order.
int final_real_index = word_indices.size() - 1;
|
||||
while (final_real_index > 0 && word_indices[final_real_index] < 0)
|
||||
final_real_index--;
|
||||
// Search for the current word among all but the last real entry; if it
// is the last one there is no next word on this line.
for (int i = 0; i < final_real_index; i++) {
|
||||
if (word_indices[i] == this_word_index) {
|
||||
// Skip the markers that follow, applying their effect on the
// minor-run state.
int j = i + 1;
|
||||
for (; j < final_real_index && word_indices[j] < 0; j++) {
|
||||
if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
|
||||
if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
|
||||
}
|
||||
at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
|
||||
// awesome, we move to word_indices[j]
|
||||
if (BidiDebug(3)) {
|
||||
tprintf("Next(RIL_WORD): %d -> %d\n",
|
||||
this_word_index, word_indices[j]);
|
||||
}
|
||||
// Reach the target by rewinding to the row start and stepping right.
PageIterator::RestartRow();
|
||||
for (int k = 0; k < word_indices[j]; k++) {
|
||||
PageIterator::Next(RIL_WORD);
|
||||
}
|
||||
MoveToLogicalStartOfWord();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (BidiDebug(3)) {
|
||||
tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
|
||||
}
|
||||
// we're going off the end of the text line.
|
||||
return Next(RIL_TEXTLINE);
|
||||
}
|
||||
}
|
||||
ASSERT_HOST(false); // shouldn't happen.
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the iterator sits at the (reading order) start of an
// object at the given level. Each level builds on the one below it: a
// text line start requires a word start, a block start requires a text
// line start, and a paragraph start is either a block start or a text line
// start whose paragraph differs from the previous row's.
bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == NULL) return false;  // Already at the end!
  // In an image block, or asked about a symbol: always at the beginning.
  if (it_->word() == NULL) return true;
  if (level == RIL_SYMBOL) return true;

  const bool starts_word = IsAtFirstSymbolOfWord();
  if (level == RIL_WORD) return starts_word;

  // Send a copy to the logical first word of this line and compare.
  ResultIterator probe(*this);
  probe.MoveToLogicalStartOfTextline();
  bool starts_line = false;
  if (starts_word) {
    starts_line = (*probe.it_ == *it_);
  }
  if (level == RIL_TEXTLINE) return starts_line;

  // Block/paragraph membership is judged from the left-most word.
  probe.RestartRow();
  bool starts_block = false;
  if (starts_line) {
    starts_block = (probe.it_->block() != probe.it_->prev_block());
  }
  if (level == RIL_BLOCK) return starts_block;

  bool starts_para = starts_block;
  if (!starts_para && starts_line) {
    starts_para = (probe.it_->row()->row->para() !=
                   probe.it_->prev_row()->row->para());
  }
  if (level == RIL_PARA) return starts_para;

  ASSERT_HOST(false);  // shouldn't happen.
  return false;
}
|
||||
|
||||
// NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the
|
||||
// change that the variable next is now a ResultIterator instead of a
|
||||
// PageIterator.
|
||||
bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
|
||||
PageIteratorLevel element) const {
|
||||
if (Empty(element)) return true; // Already at the end!
|
||||
// The result is true if we step forward by element and find we are
|
||||
// at the the end of the page or at beginning of *all* levels in:
|
||||
// [level, element).
|
||||
// When there is more than one level difference between element and level,
|
||||
// we could for instance move forward one symbol and still be at the first
|
||||
// word on a line, so we also have to be at the first symbol in a word.
|
||||
ResultIterator next(*this);
|
||||
next.Next(element);
|
||||
if (next.Empty(element)) return true; // Reached the end of the page.
|
||||
while (element > level) {
|
||||
element = static_cast<PageIteratorLevel>(element - 1);
|
||||
if (!next.IsAtBeginningOf(element))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// object at the given level. Use delete [] to free after use.
|
||||
char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
STRING text;
|
||||
switch (level) {
|
||||
case RIL_BLOCK:
|
||||
{
|
||||
ResultIterator pp(*this);
|
||||
do {
|
||||
pp.AppendUTF8ParagraphText(&text);
|
||||
} while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
|
||||
}
|
||||
break;
|
||||
case RIL_PARA:
|
||||
AppendUTF8ParagraphText(&text);
|
||||
break;
|
||||
case RIL_TEXTLINE:
|
||||
{
|
||||
ResultIterator it(*this);
|
||||
it.MoveToLogicalStartOfTextline();
|
||||
it.IterateAndAppendUTF8TextlineText(&text);
|
||||
}
|
||||
break;
|
||||
case RIL_WORD:
|
||||
AppendUTF8WordText(&text);
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
{
|
||||
bool reading_direction_is_ltr =
|
||||
current_paragraph_is_ltr_ ^ in_minor_direction_;
|
||||
if (at_beginning_of_minor_run_) {
|
||||
text += reading_direction_is_ltr ? kLRM : kRLM;
|
||||
}
|
||||
text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
|
||||
if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
|
||||
}
|
||||
break;
|
||||
}
|
||||
int length = text.length() + 1;
|
||||
char* result = new char[length];
|
||||
strncpy(result, text.string(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Appends the UTF-8 text of the current word to *text with its blobs in
// reading order, preceded by a directional mark when the word opens a
// minor-direction run and followed by any suffix marks.
void ResultIterator::AppendUTF8WordText(STRING *text) const {
  if (it_->word() == NULL) return;
  ASSERT_HOST(it_->word()->best_choice != NULL);

  // The effective direction is the paragraph direction, flipped while inside
  // a minor-direction run.
  const bool ltr = current_paragraph_is_ltr_ != in_minor_direction_;
  if (at_beginning_of_minor_run_) {
    if (ltr) {
      *text += kLRM;
    } else {
      *text += kRLM;
    }
  }

  GenericVector<int> order;
  CalculateBlobOrder(&order);
  const int blob_count = order.size();
  int pos = 0;
  while (pos < blob_count) {
    *text += it_->word()->BestUTF8(order[pos], !ltr);
    ++pos;
  }
  AppendSuffixMarks(text);
}
|
||||
|
||||
// Appends the text of the current text line to *text, word by word and
// separated by single spaces, then the line separator, and additionally the
// paragraph separator when the line ended a paragraph (or the page).
// Advances this iterator past the line as a side effect.
void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
  // Nothing to print for an empty word position; just step over it.
  if (Empty(RIL_WORD)) {
    Next(RIL_WORD);
    return;
  }

  if (BidiDebug(1)) {
    GenericVectorEqEq<int> order;
    GenericVector<StrongScriptDirection> script_dirs;
    CalculateTextlineOrder(current_paragraph_is_ltr_,
                           *this, &script_dirs, &order);
    tprintf("Strong Script dirs     [%p/P=%s]: ", it_->row(),
            current_paragraph_is_ltr_ ? "ltr" : "rtl");
    PrintScriptDirs(script_dirs);
    tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
            current_paragraph_is_ltr_ ? "ltr" : "rtl");
    int idx = 0;
    while (idx < order.size()) {
      tprintf("%d ", order[idx]);
      ++idx;
    }
    tprintf("\n");
  }

  // Emit words until we either run out of words or step onto a new line.
  int word_count = 0;
  for (;;) {
    AppendUTF8WordText(text);
    ++word_count;
    *text += " ";
    if (!Next(RIL_WORD)) break;
    if (IsAtBeginningOf(RIL_TEXTLINE)) break;
  }
  if (BidiDebug(1)) {
    tprintf("%d words printed\n", word_count);
  }

  // Drop the space that followed the last word before terminating the line.
  text->truncate_at(text->length() - 1);
  *text += line_separator_;

  // If we just finished a paragraph, add the paragraph separator as well.
  const bool paragraph_done =
      it_->block() == NULL || IsAtBeginningOf(RIL_PARA);
  if (paragraph_done) {
    *text += paragraph_separator_;
  }
}
|
||||
|
||||
// Appends the text of the whole current paragraph to *text, one text line at
// a time. Works on a copy, so this iterator itself does not move.
void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
  ResultIterator cursor(*this);
  cursor.RestartParagraph();
  cursor.MoveToLogicalStartOfTextline();
  if (cursor.Empty(RIL_WORD)) return;

  // Each call consumes one line and leaves the cursor on the next one; stop
  // at the end of the page or when the next paragraph begins.
  for (;;) {
    cursor.IterateAndAppendUTF8TextlineText(text);
    if (cursor.it_->block() == NULL) break;
    if (cursor.IsAtBeginningOf(RIL_PARA)) break;
  }
}
|
||||
|
||||
bool ResultIterator::BidiDebug(int min_level) const {
|
||||
int debug_level = 1;
|
||||
IntParam *p = ParamUtils::FindParam<IntParam>(
|
||||
"bidi_debug", GlobalParams()->int_params,
|
||||
tesseract_->params()->int_params);
|
||||
if (p != NULL) debug_level = (inT32)(*p);
|
||||
return debug_level >= min_level;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
198
ccmain/resultiterator.h
Normal file
198
ccmain/resultiterator.h
Normal file
@ -0,0 +1,198 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: resultiterator.h
|
||||
// Description: Iterator for tesseract results that is capable of
|
||||
// iterating in proper reading order over Bi Directional
|
||||
// (e.g. mixed Hebrew and English) text.
|
||||
// Author: David Eger
|
||||
// Created: Fri May 27 13:58:06 PST 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__
|
||||
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H__
|
||||
|
||||
#include "ltrresultiterator.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
class BLOB_CHOICE_IT;
|
||||
class WERD_RES;
|
||||
class STRING;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class Tesseract;
|
||||
|
||||
class ResultIterator : public LTRResultIterator {
|
||||
public:
|
||||
static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
|
||||
|
||||
// ResultIterator is copy constructible!
|
||||
// The default copy constructor works just fine for us.
|
||||
virtual ~ResultIterator() {}
|
||||
|
||||
// ============= Moving around within the page ============.
|
||||
// Moves the iterator to point to the start of the page to begin an iteration.
|
||||
virtual void Begin();
|
||||
|
||||
// Moves to the start of the next object at the given level in the
|
||||
// page hierarchy in the appropriate reading order and returns false if
|
||||
// the end of the page was reached.
|
||||
// NOTE that RIL_SYMBOL will skip non-text blocks, but all other
|
||||
// PageIteratorLevel level values will visit each non-text block once.
|
||||
// Think of non text blocks as containing a single para, with a single line,
|
||||
// with a single imaginary word.
|
||||
// Calls to Next with different levels may be freely intermixed.
|
||||
// This function iterates words in right-to-left scripts correctly, if
|
||||
// the appropriate language has been loaded into Tesseract.
|
||||
virtual bool Next(PageIteratorLevel level);
|
||||
|
||||
// IsAtBeginningOf() returns whether we're at the logical beginning of the
|
||||
// given level. (as opposed to ResultIterator's left-to-right top-to-bottom
|
||||
// order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf().
|
||||
// For a full description, see pageiterator.h
|
||||
virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
|
||||
|
||||
// Implement PageIterator's IsAtFinalElement correctly in a BiDi context.
|
||||
// For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we
|
||||
// point at the last word in a paragraph. See PageIterator for full comment.
|
||||
virtual bool IsAtFinalElement(PageIteratorLevel level,
|
||||
PageIteratorLevel element) const;
|
||||
|
||||
// ============= Accessing data ==============.
|
||||
|
||||
// Returns the null terminated UTF-8 encoded text string for the current
|
||||
// object at the given level. Use delete [] to free after use.
|
||||
virtual char* GetUTF8Text(PageIteratorLevel level) const;
|
||||
|
||||
// Return whether the current paragraph's dominant reading direction
|
||||
// is left-to-right (as opposed to right-to-left).
|
||||
bool ParagraphIsLtr() const;
|
||||
|
||||
// ============= Exposed only for testing =============.
|
||||
|
||||
// Yields the reading order as a sequence of indices and (optional)
|
||||
// meta-marks for a set of words (given left-to-right).
|
||||
// The meta marks are passed as negative values:
|
||||
// kMinorRunStart Start of minor direction text.
|
||||
// kMinorRunEnd End of minor direction text.
|
||||
// kComplexWord The next indexed word contains both left-to-right and
|
||||
// right-to-left characters and was treated as neutral.
|
||||
//
|
||||
// For example, suppose we have five words in a text line,
|
||||
// indexed [0,1,2,3,4] from the leftmost side of the text line.
|
||||
// The following are all believable reading_orders:
|
||||
//
|
||||
// Left-to-Right (in ltr paragraph):
|
||||
// { 0, 1, 2, 3, 4 }
|
||||
// Left-to-Right (in rtl paragraph):
|
||||
// { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
|
||||
// Right-to-Left (in rtl paragraph):
|
||||
// { 4, 3, 2, 1, 0 }
|
||||
// Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph:
|
||||
// { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }
|
||||
static void CalculateTextlineOrder(
|
||||
bool paragraph_is_ltr,
|
||||
const GenericVector<StrongScriptDirection> &word_dirs,
|
||||
GenericVectorEqEq<int> *reading_order);
|
||||
|
||||
static const int kMinorRunStart;
|
||||
static const int kMinorRunEnd;
|
||||
static const int kComplexWord;
|
||||
|
||||
protected:
|
||||
// We presume the data associated with the given iterator will outlive us.
|
||||
// NB: This is private because it does something that is non-obvious:
|
||||
// it resets to the beginning of the paragraph instead of staying wherever
|
||||
// resit might have pointed.
|
||||
explicit ResultIterator(const LTRResultIterator &resit);
|
||||
|
||||
private:
|
||||
// Calculates the current paragraph's dominant writing direction.
|
||||
// Typically, members should use current_paragraph_ltr_ instead.
|
||||
bool CurrentParagraphIsLtr() const;
|
||||
|
||||
// Returns word indices as measured from resit->RestartRow() = index 0
|
||||
// for the reading order of words within a textline given an iterator
|
||||
// into the middle of the text line.
|
||||
// In addition to non-negative word indices, the following negative values
|
||||
// may be inserted:
|
||||
// kMinorRunStart Start of minor direction text.
|
||||
// kMinorRunEnd End of minor direction text.
|
||||
// kComplexWord The previous word contains both left-to-right and
|
||||
// right-to-left characters and was treated as neutral.
|
||||
void CalculateTextlineOrder(bool paragraph_is_ltr,
|
||||
const LTRResultIterator &resit,
|
||||
GenericVectorEqEq<int> *indices) const;
|
||||
// Same as above, but the caller's ssd gets filled in if ssd != NULL.
|
||||
void CalculateTextlineOrder(bool paragraph_is_ltr,
|
||||
const LTRResultIterator &resit,
|
||||
GenericVector<StrongScriptDirection> *ssd,
|
||||
GenericVectorEqEq<int> *indices) const;
|
||||
|
||||
// What is the index of the current word in a strict left-to-right reading
|
||||
// of the row?
|
||||
int LTRWordIndex() const;
|
||||
|
||||
// Given an iterator pointing at a word, returns the logical reading order
|
||||
// of blob indices for the word.
|
||||
void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
|
||||
|
||||
// Precondition: current_paragraph_is_ltr_ is set.
|
||||
void MoveToLogicalStartOfTextline();
|
||||
|
||||
// Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set.
|
||||
void MoveToLogicalStartOfWord();
|
||||
|
||||
// Are we pointing at the final (reading order) symbol of the word?
|
||||
bool IsAtFinalSymbolOfWord() const;
|
||||
|
||||
// Are we pointing at the first (reading order) symbol of the word?
|
||||
bool IsAtFirstSymbolOfWord() const;
|
||||
|
||||
// Append any extra marks that should be appended to this word when printed.
|
||||
// Mostly, these are Unicode BiDi control characters.
|
||||
void AppendSuffixMarks(STRING *text) const;
|
||||
|
||||
// Appends the current word in reading order to the given buffer.
|
||||
void AppendUTF8WordText(STRING *text) const;
|
||||
|
||||
// Appends the text of the current text line, *assuming this iterator is
|
||||
// positioned at the beginning of the text line* This function
|
||||
// updates the iterator to point to the first position past the text line.
|
||||
// Each textline is terminated in a single newline character.
|
||||
// If the textline ends a paragraph, it gets a second terminal newline.
|
||||
void IterateAndAppendUTF8TextlineText(STRING *text);
|
||||
|
||||
// Appends the text of the current paragraph in reading order
|
||||
// to the given buffer.
|
||||
// Each textline is terminated in a single newline character, and the
|
||||
// paragraph gets an extra newline at the end.
|
||||
void AppendUTF8ParagraphText(STRING *text) const;
|
||||
|
||||
// Returns whether the bidi_debug flag is set to at least min_level.
|
||||
bool BidiDebug(int min_level) const;
|
||||
|
||||
bool current_paragraph_is_ltr_;
|
||||
|
||||
// Is the currently pointed-at character at the beginning of
|
||||
// a minor-direction run?
|
||||
bool at_beginning_of_minor_run_;
|
||||
|
||||
// Is the currently pointed-at character in a minor-direction sequence?
|
||||
bool in_minor_direction_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H__
|
Loading…
Reference in New Issue
Block a user