mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Provide better paragraph segmentation without having to run fully
automatic layout analysis. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@725 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
e606c311f5
commit
eeeb4f513c
@ -632,10 +632,12 @@ PageIterator* TessBaseAPI::AnalyseLayout() {
|
||||
if (block_list_->empty())
|
||||
return NULL; // The page was empty.
|
||||
page_res_ = new PAGE_RES(block_list_, NULL);
|
||||
return new PageIterator(page_res_, tesseract_,
|
||||
thresholder_->GetScaleFactor(),
|
||||
thresholder_->GetScaledYResolution(),
|
||||
rect_left_, rect_top_, rect_width_, rect_height_);
|
||||
DetectParagraphs(false);
|
||||
return new PageIterator(
|
||||
page_res_, tesseract_,
|
||||
thresholder_->GetScaleFactor(),
|
||||
thresholder_->GetScaledYResolution(),
|
||||
rect_left_, rect_top_, rect_width_, rect_height_);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
@ -692,9 +694,7 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
|
||||
} else {
|
||||
// Now run the main recognition.
|
||||
if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
|
||||
int paragraph_debug_level = 0;
|
||||
GetIntVariable("paragraph_debug_level", ¶graph_debug_level);
|
||||
DetectParagraphs(paragraph_debug_level);
|
||||
DetectParagraphs(true);
|
||||
} else {
|
||||
result = -1;
|
||||
}
|
||||
@ -1926,13 +1926,16 @@ PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
|
||||
return pass1_result;
|
||||
}
|
||||
|
||||
void TessBaseAPI::DetectParagraphs(int debug_level) {
|
||||
void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
|
||||
int debug_level = 0;
|
||||
GetIntVariable("paragraph_debug_level", &debug_level);
|
||||
if (paragraph_models_ == NULL)
|
||||
paragraph_models_ = new GenericVector<ParagraphModel*>;
|
||||
MutableIterator *result_it = GetMutableIterator();
|
||||
do { // Detect paragraphs for this block
|
||||
GenericVector<ParagraphModel *> models;
|
||||
::tesseract::DetectParagraphs(debug_level, result_it, &models);
|
||||
::tesseract::DetectParagraphs(debug_level, after_text_recognition,
|
||||
result_it, &models);
|
||||
*paragraph_models_ += models;
|
||||
} while (result_it->Next(RIL_BLOCK));
|
||||
delete result_it;
|
||||
|
@ -732,8 +732,7 @@ class TESS_API TessBaseAPI {
|
||||
TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list, PAGE_RES* pass1_result);
|
||||
|
||||
//// paragraphs.cpp ////////////////////////////////////////////////////
|
||||
/** After text is recognized, break each paragraph into blocks. */
|
||||
TESS_LOCAL void DetectParagraphs(int debug_level);
|
||||
TESS_LOCAL void DetectParagraphs(bool after_text_recognition);
|
||||
|
||||
/**
|
||||
* Extract the OCR results, costs (penalty points for uncertainty),
|
||||
|
@ -2302,9 +2302,60 @@ void DetectParagraphs(int debug_level,
|
||||
|
||||
// ============ Code interfacing with the rest of Tesseract ==================
|
||||
|
||||
void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
|
||||
RowInfo *info) {
|
||||
// Set up text, lword_text, and rword_text (mostly for debug printing).
|
||||
STRING fake_text;
|
||||
PageIterator pit(static_cast<const PageIterator&>(it));
|
||||
bool first_word = true;
|
||||
if (!pit.Empty(RIL_WORD)) {
|
||||
do {
|
||||
fake_text += "x";
|
||||
if (first_word) info->lword_text += "x";
|
||||
info->rword_text += "x";
|
||||
if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
|
||||
!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
|
||||
fake_text += " ";
|
||||
info->rword_text = "";
|
||||
first_word = false;
|
||||
}
|
||||
} while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
|
||||
pit.Next(RIL_SYMBOL));
|
||||
}
|
||||
if (fake_text.size() == 0) return;
|
||||
|
||||
int lspaces = info->pix_ldistance / info->average_interword_space;
|
||||
for (int i = 0; i < lspaces; i++) {
|
||||
info->text += ' ';
|
||||
}
|
||||
info->text += fake_text;
|
||||
|
||||
// Set up lword_box, rword_box, and num_words.
|
||||
PAGE_RES_IT page_res_it = *it.PageResIt();
|
||||
WERD_RES *word_res = page_res_it.restart_row();
|
||||
ROW_RES *this_row = page_res_it.row();
|
||||
|
||||
WERD_RES *lword = NULL;
|
||||
WERD_RES *rword = NULL;
|
||||
info->num_words = 0;
|
||||
do {
|
||||
if (word_res) {
|
||||
if (!lword) lword = word_res;
|
||||
if (rword != word_res) info->num_words++;
|
||||
rword = word_res;
|
||||
}
|
||||
word_res = page_res_it.forward();
|
||||
} while (page_res_it.row() == this_row);
|
||||
info->lword_box = lword->word->bounding_box();
|
||||
info->rword_box = rword->word->bounding_box();
|
||||
}
|
||||
|
||||
|
||||
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
|
||||
// detector RowInfo with all relevant information from the row.
|
||||
void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
|
||||
void InitializeRowInfo(bool after_recognition,
|
||||
const MutableIterator &it,
|
||||
RowInfo *info) {
|
||||
if (it.PageResIt()->row() != NULL) {
|
||||
ROW *row = it.PageResIt()->row()->row;
|
||||
info->pix_ldistance = row->lmargin();
|
||||
@ -2324,6 +2375,20 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
|
||||
info->ltr = true;
|
||||
}
|
||||
|
||||
info->num_words = 0;
|
||||
info->lword_indicates_list_item = false;
|
||||
info->lword_likely_starts_idea = false;
|
||||
info->lword_likely_ends_idea = false;
|
||||
info->rword_indicates_list_item = false;
|
||||
info->rword_likely_starts_idea = false;
|
||||
info->rword_likely_ends_idea = false;
|
||||
info->has_leaders = false;
|
||||
info->ltr = 1;
|
||||
|
||||
if (!after_recognition) {
|
||||
InitializeTextAndBoxesPreRecognition(it, info);
|
||||
return;
|
||||
}
|
||||
info->text = "";
|
||||
char *text = it.GetUTF8Text(RIL_TEXTLINE);
|
||||
int trailing_ws_idx = strlen(text); // strip trailing space
|
||||
@ -2341,28 +2406,17 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
|
||||
}
|
||||
delete []text;
|
||||
|
||||
info->num_words = 0;
|
||||
info->lword_indicates_list_item = false;
|
||||
info->lword_likely_starts_idea = false;
|
||||
info->lword_likely_ends_idea = false;
|
||||
info->rword_indicates_list_item = false;
|
||||
info->rword_likely_starts_idea = false;
|
||||
info->rword_likely_ends_idea = false;
|
||||
|
||||
if (info->text.size() == 0) {
|
||||
info->rword_likely_ends_idea = false;
|
||||
info->rword_likely_ends_idea = false;
|
||||
return;
|
||||
}
|
||||
|
||||
int ltr = 0;
|
||||
int rtl = 0;
|
||||
|
||||
PAGE_RES_IT page_res_it = *it.PageResIt();
|
||||
GenericVector<WERD_RES *> werds;
|
||||
WERD_RES *word_res = page_res_it.restart_row();
|
||||
ROW_RES *this_row = page_res_it.row();
|
||||
int num_leaders = 0;
|
||||
int ltr = 0;
|
||||
int rtl = 0;
|
||||
do {
|
||||
if (word_res && word_res->best_choice->unichar_string().length() > 0) {
|
||||
werds.push_back(word_res);
|
||||
@ -2372,7 +2426,7 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
|
||||
}
|
||||
word_res = page_res_it.forward();
|
||||
} while (page_res_it.row() == this_row);
|
||||
|
||||
info->ltr = ltr >= rtl;
|
||||
info->has_leaders = num_leaders > 3;
|
||||
info->num_words = werds.size();
|
||||
if (werds.size() > 0) {
|
||||
@ -2392,13 +2446,13 @@ void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
|
||||
&info->rword_likely_starts_idea,
|
||||
&info->rword_likely_ends_idea);
|
||||
}
|
||||
info->ltr = ltr >= rtl;
|
||||
}
|
||||
|
||||
// This is called after rows have been identified and words are recognized.
|
||||
// Much of this could be implemented before word recognition, but text helps
|
||||
// to identify bulleted lists and gives good signals for sentence boundaries.
|
||||
void DetectParagraphs(int debug_level,
|
||||
bool after_text_recognition,
|
||||
const MutableIterator *block_start,
|
||||
GenericVector<ParagraphModel *> *models) {
|
||||
// Clear out any preconceived notions.
|
||||
@ -2422,10 +2476,29 @@ void DetectParagraphs(int debug_level,
|
||||
row.PageResIt()->row()->row->set_para(NULL);
|
||||
row_infos.push_back(RowInfo());
|
||||
RowInfo &ri = row_infos.back();
|
||||
InitializeRowInfo(row, &ri);
|
||||
InitializeRowInfo(after_text_recognition, row, &ri);
|
||||
} while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
|
||||
row.Next(RIL_TEXTLINE));
|
||||
|
||||
// If we're called before text recognition, we might not have
|
||||
// tight block bounding boxes, so trim by the minimum on each side.
|
||||
if (row_infos.size() > 0) {
|
||||
int min_lmargin = row_infos[0].pix_ldistance;
|
||||
int min_rmargin = row_infos[0].pix_rdistance;
|
||||
for (int i = 1; i < row_infos.size(); i++) {
|
||||
if (row_infos[i].pix_ldistance < min_lmargin)
|
||||
min_lmargin = row_infos[i].pix_ldistance;
|
||||
if (row_infos[i].pix_rdistance < min_rmargin)
|
||||
min_rmargin = row_infos[i].pix_rdistance;
|
||||
}
|
||||
if (min_lmargin > 0 || min_rmargin > 0) {
|
||||
for (int i = 0; i < row_infos.size(); i++) {
|
||||
row_infos[i].pix_ldistance -= min_lmargin;
|
||||
row_infos[i].pix_rdistance -= min_rmargin;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run the paragraph detection algorithm.
|
||||
GenericVector<PARA *> row_owners;
|
||||
GenericVector<PARA *> the_paragraphs;
|
||||
|
@ -99,6 +99,7 @@ void DetectParagraphs(int debug_level,
|
||||
// We use unicharset during the function to answer questions such as "is the
|
||||
// first letter of this word upper case?"
|
||||
void DetectParagraphs(int debug_level,
|
||||
bool after_text_recognition,
|
||||
const MutableIterator *block_start,
|
||||
GenericVector<ParagraphModel *> *models);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user