Provide better paragraph segmentation without having to run fully automatic layout analysis.



git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@725 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
david.eger@gmail.com 2012-05-10 00:03:34 +00:00
parent e606c311f5
commit eeeb4f513c
4 changed files with 104 additions and 28 deletions

View File

@ -632,10 +632,12 @@ PageIterator* TessBaseAPI::AnalyseLayout() {
if (block_list_->empty())
return NULL; // The page was empty.
page_res_ = new PAGE_RES(block_list_, NULL);
return new PageIterator(page_res_, tesseract_,
thresholder_->GetScaleFactor(),
thresholder_->GetScaledYResolution(),
rect_left_, rect_top_, rect_width_, rect_height_);
DetectParagraphs(false);
return new PageIterator(
page_res_, tesseract_,
thresholder_->GetScaleFactor(),
thresholder_->GetScaledYResolution(),
rect_left_, rect_top_, rect_width_, rect_height_);
}
return NULL;
}
@ -692,9 +694,7 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
} else {
// Now run the main recognition.
if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
int paragraph_debug_level = 0;
GetIntVariable("paragraph_debug_level", &paragraph_debug_level);
DetectParagraphs(paragraph_debug_level);
DetectParagraphs(true);
} else {
result = -1;
}
@ -1926,13 +1926,16 @@ PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
return pass1_result;
}
void TessBaseAPI::DetectParagraphs(int debug_level) {
void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
int debug_level = 0;
GetIntVariable("paragraph_debug_level", &debug_level);
if (paragraph_models_ == NULL)
paragraph_models_ = new GenericVector<ParagraphModel*>;
MutableIterator *result_it = GetMutableIterator();
do { // Detect paragraphs for this block
GenericVector<ParagraphModel *> models;
::tesseract::DetectParagraphs(debug_level, result_it, &models);
::tesseract::DetectParagraphs(debug_level, after_text_recognition,
result_it, &models);
*paragraph_models_ += models;
} while (result_it->Next(RIL_BLOCK));
delete result_it;

View File

@ -732,8 +732,7 @@ class TESS_API TessBaseAPI {
TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list, PAGE_RES* pass1_result);
//// paragraphs.cpp ////////////////////////////////////////////////////
/** Break each block into paragraphs; may be run before or after text is recognized. */
TESS_LOCAL void DetectParagraphs(int debug_level);
TESS_LOCAL void DetectParagraphs(bool after_text_recognition);
/**
* Extract the OCR results, costs (penalty points for uncertainty),

View File

@ -2302,9 +2302,60 @@ void DetectParagraphs(int debug_level,
// ============ Code interfacing with the rest of Tesseract ==================
// Fills in the text-related fields of |info| for the text line that |it|
// points at, without reading any recognized text.
//
// A placeholder string is built with one "x" per symbol and a single space
// between words. info->lword_text receives the x's of the first word and
// info->rword_text ends up holding the x's of the last word. info->text gets
// pix_ldistance / average_interword_space leading spaces followed by the
// placeholder string, so |info| must already carry those two values.
//
// info->num_words, info->lword_box and info->rword_box are then taken from
// the WERD_RES entries of the row. If the line has no symbols the function
// returns early and leaves those three fields, and info->text, untouched.
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
                                          RowInfo *info) {
  // Set up text, lword_text, and rword_text (mostly for debug printing).
  STRING fake_text;
  PageIterator pit(static_cast<const PageIterator&>(it));
  bool first_word = true;
  if (!pit.Empty(RIL_WORD)) {
    do {
      fake_text += "x";
      if (first_word) info->lword_text += "x";
      info->rword_text += "x";
      // At the last symbol of a word that is not the last word of the line:
      // emit the word separator and start collecting the next word.
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
        fake_text += " ";
        info->rword_text = "";
        first_word = false;
      }
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
             pit.Next(RIL_SYMBOL));
  }
  if (fake_text.size() == 0) return;

  // Approximate the left indent as a run of spaces. Guard the division so a
  // zero inter-word space cannot trigger a divide-by-zero.
  int lspaces = 0;
  if (info->average_interword_space > 0) {
    lspaces = info->pix_ldistance / info->average_interword_space;
  }
  for (int i = 0; i < lspaces; i++) {
    info->text += ' ';
  }
  info->text += fake_text;

  // Set up lword_box, rword_box, and num_words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();

  WERD_RES *lword = NULL;
  WERD_RES *rword = NULL;
  info->num_words = 0;
  do {
    if (word_res) {
      if (!lword) lword = word_res;
      // Count a word only when it differs from the previously seen one.
      if (rword != word_res) info->num_words++;
      rword = word_res;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);

  // The walk above may not have produced any word; only read the bounding
  // boxes of words that were actually found.
  if (lword) info->lword_box = lword->word->bounding_box();
  if (rword) info->rword_box = rword->word->bounding_box();
}
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
// detector RowInfo with all relevant information from the row.
void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
void InitializeRowInfo(bool after_recognition,
const MutableIterator &it,
RowInfo *info) {
if (it.PageResIt()->row() != NULL) {
ROW *row = it.PageResIt()->row()->row;
info->pix_ldistance = row->lmargin();
@ -2324,6 +2375,20 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
info->ltr = true;
}
info->num_words = 0;
info->lword_indicates_list_item = false;
info->lword_likely_starts_idea = false;
info->lword_likely_ends_idea = false;
info->rword_indicates_list_item = false;
info->rword_likely_starts_idea = false;
info->rword_likely_ends_idea = false;
info->has_leaders = false;
info->ltr = 1;
if (!after_recognition) {
InitializeTextAndBoxesPreRecognition(it, info);
return;
}
info->text = "";
char *text = it.GetUTF8Text(RIL_TEXTLINE);
int trailing_ws_idx = strlen(text); // strip trailing space
@ -2341,28 +2406,17 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
}
delete []text;
info->num_words = 0;
info->lword_indicates_list_item = false;
info->lword_likely_starts_idea = false;
info->lword_likely_ends_idea = false;
info->rword_indicates_list_item = false;
info->rword_likely_starts_idea = false;
info->rword_likely_ends_idea = false;
if (info->text.size() == 0) {
info->rword_likely_ends_idea = false;
info->rword_likely_ends_idea = false;
return;
}
int ltr = 0;
int rtl = 0;
PAGE_RES_IT page_res_it = *it.PageResIt();
GenericVector<WERD_RES *> werds;
WERD_RES *word_res = page_res_it.restart_row();
ROW_RES *this_row = page_res_it.row();
int num_leaders = 0;
int ltr = 0;
int rtl = 0;
do {
if (word_res && word_res->best_choice->unichar_string().length() > 0) {
werds.push_back(word_res);
@ -2372,7 +2426,7 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
}
word_res = page_res_it.forward();
} while (page_res_it.row() == this_row);
info->ltr = ltr >= rtl;
info->has_leaders = num_leaders > 3;
info->num_words = werds.size();
if (werds.size() > 0) {
@ -2392,13 +2446,13 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
&info->rword_likely_starts_idea,
&info->rword_likely_ends_idea);
}
info->ltr = ltr >= rtl;
}
// This is called after rows have been identified and words are recognized.
// Much of this could be implemented before word recognition, but text helps
// to identify bulleted lists and gives good signals for sentence boundaries.
void DetectParagraphs(int debug_level,
bool after_text_recognition,
const MutableIterator *block_start,
GenericVector<ParagraphModel *> *models) {
// Clear out any preconceived notions.
@ -2422,10 +2476,29 @@ void DetectParagraphs(int debug_level,
row.PageResIt()->row()->row->set_para(NULL);
row_infos.push_back(RowInfo());
RowInfo &ri = row_infos.back();
InitializeRowInfo(row, &ri);
InitializeRowInfo(after_text_recognition, row, &ri);
} while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
row.Next(RIL_TEXTLINE));
// If we're called before text recognition, we might not have
// tight block bounding boxes, so trim by the minimum on each side.
if (row_infos.size() > 0) {
int min_lmargin = row_infos[0].pix_ldistance;
int min_rmargin = row_infos[0].pix_rdistance;
for (int i = 1; i < row_infos.size(); i++) {
if (row_infos[i].pix_ldistance < min_lmargin)
min_lmargin = row_infos[i].pix_ldistance;
if (row_infos[i].pix_rdistance < min_rmargin)
min_rmargin = row_infos[i].pix_rdistance;
}
if (min_lmargin > 0 || min_rmargin > 0) {
for (int i = 0; i < row_infos.size(); i++) {
row_infos[i].pix_ldistance -= min_lmargin;
row_infos[i].pix_rdistance -= min_rmargin;
}
}
}
// Run the paragraph detection algorithm.
GenericVector<PARA *> row_owners;
GenericVector<PARA *> the_paragraphs;

View File

@ -99,6 +99,7 @@ void DetectParagraphs(int debug_level,
// We use unicharset during the function to answer questions such as "is the
// first letter of this word upper case?"
void DetectParagraphs(int debug_level,
bool after_text_recognition,
const MutableIterator *block_start,
GenericVector<ParagraphModel *> *models);