Render output in TSV format.

This commit is contained in:
Sundar M. Vaidya 2014-08-22 14:45:53 +05:45 committed by Tom Morris
parent 738fe4f757
commit b1e4a82b0b
2 changed files with 67 additions and 91 deletions

View File

@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
*hocr_str += "\">";
}
/**
 * Append the bounding box of the element under |it| at |level| to
 * |hocr_str| as four tab-prefixed integer fields, in this order:
 * left, top, width, height.
 *
 * Width and height are derived from the box edges reported by
 * BoundingBox() as (right - left + 1) and (bottom - top + 1).
 * Anything BoundingBox() may return is ignored here.
 */
static void AddBoxTohOCRTSV(const PageIterator *it,
                            PageIteratorLevel level,
                            STRING* hocr_str) {
  int x_min, y_min, x_max, y_max;
  it->BoundingBox(level, &x_min, &y_min, &x_max, &y_max);

  // Convert the right/bottom edges into extents for the TSV columns.
  const int box_width = x_max - x_min + 1;
  const int box_height = y_max - y_min + 1;

  hocr_str->add_str_int("\t", x_min);
  hocr_str->add_str_int("\t", y_min);
  hocr_str->add_str_int("\t", box_width);
  hocr_str->add_str_int("\t", box_height);
}
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
delete[] utf8_str;
#endif
hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
hocr_str += "' title='image \"";
if (input_file_) {
hocr_str += HOcrEscape(input_file_->string());
} else {
hocr_str += "unknown";
}
hocr_str.add_str_int("\"; bbox ", rect_left_);
hocr_str.add_str_int(" ", rect_top_);
hocr_str.add_str_int(" ", rect_width_);
hocr_str.add_str_int(" ", rect_height_);
hocr_str.add_str_int("; ppageno ", page_number);
hocr_str += "'>\n";
int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
hocr_str.add_str_int("1\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
hocr_str.add_str_int("\t", rect_left_);
hocr_str.add_str_int("\t", rect_top_);
hocr_str.add_str_int("\t", rect_width_);
hocr_str.add_str_int("\t", rect_height_);
hocr_str += "\t-1\t\n";
ResultIterator *res_it = GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
// Open any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
hocr_str.add_str_int(" <div class='ocr_carea' id='block_", page_id);
hocr_str.add_str_int("_", bcnt);
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
block_num++, par_num = 0, line_num = 0, word_num = 0;
hocr_str.add_str_int("2\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
hocr_str += "\t-1\t\n";
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
if (res_it->ParagraphIsLtr()) {
hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_",
page_id);
hocr_str.add_str_int("_", pcnt);
} else {
hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_",
page_id);
hocr_str.add_str_int("_", pcnt);
}
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
par_num++, line_num = 0, word_num = 0;
hocr_str.add_str_int("3\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
hocr_str += "\t-1\t\n";
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
hocr_str.add_str_int("_", lcnt);
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
line_num++, word_num = 0;
hocr_str.add_str_int("4\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
hocr_str += "\t-1\t\n";
}
// Now, process the word...
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
hocr_str.add_str_int("_", wcnt);
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
@ -1697,34 +1715,21 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
hocr_str.add_str_int("' title='bbox ", left);
hocr_str.add_str_int(" ", top);
hocr_str.add_str_int(" ", right);
hocr_str.add_str_int(" ", bottom);
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
if (font_info) {
hocr_str += "; x_font ";
hocr_str += HOcrEscape(font_name);
hocr_str.add_str_int("; x_fsize ", pointsize);
}
hocr_str += "'";
if (res_it->WordRecognitionLanguage()) {
hocr_str += " lang='";
hocr_str += res_it->WordRecognitionLanguage();
hocr_str += "'";
}
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
default: // Do nothing.
break;
}
hocr_str += ">";
word_num++;
hocr_str.add_str_int("5\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
hocr_str.add_str_int("\t", left);
hocr_str.add_str_int("\t", top);
hocr_str.add_str_int("\t", right - left + 1);
hocr_str.add_str_int("\t", bottom - top + 1);
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>";
hocr_str += "\t";
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>";
hocr_str += "</span> ";
hocr_str += "\n";
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
hocr_str += "\n </span>";
lcnt++;
}
if (last_word_in_para) {
hocr_str += "\n </p>\n";
pcnt++;
}
if (last_word_in_block) {
hocr_str += " </div>\n";
bcnt++;
}
}
hocr_str += " </div>\n";
char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());

View File

@ -193,43 +193,20 @@ TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
}
// Writes the document preamble through AppendString() and always returns true.
//
// As listed, the function emits two things back to back:
//   1. an XHTML head (XML declaration, DOCTYPE, <title>, <meta> tags,
//      opening <body>), and
//   2. a single tab-separated column-header line for the TSV rows.
//
// NOTE(review): this listing appears to come from a diff whose +/- markers
// were stripped (the hunk header reports 43 lines before and 20 after), so
// the XHTML calls are presumably the removed lines and only the TSV header
// remains in the committed file -- confirm against the real source.
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
// XML declaration, DOCTYPE and the start of <head>/<title>.
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
"lang=\"en\">\n <head>\n <title>\n");
// Document title supplied by title().
AppendString(title());
// Close the title and emit the content-type, ocr-system and the start of the
// ocr-capabilities meta tag; TESSERACT_VERSION_STR is spliced in by literal
// concatenation.
AppendString(
"</title>\n"
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" />\n"
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
"' />\n"
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
" ocr_line ocrx_word");
// The extra capability names are appended only when font_info_ is set.
if (font_info_)
AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
// Terminate the capabilities meta tag, close <head> and open <body>.
AppendString(
"'/>\n"
"</head>\n<body>\n");
// TSV column names, one per field, terminated by a newline.
AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
return true;
}
// Writes the document trailer through AppendString() and always returns true.
//
// NOTE(review): as listed, this appends closing </body> and </html> tags.
// The listing appears to be a diff with its +/- markers stripped, so this
// call is presumably a removed line and the committed function may append
// nothing -- confirm against the real source.
bool TessHOcrTsvRenderer::EndDocumentHandler() {
AppendString(" </body>\n</html>\n");
return true;
}
/**
 * Append the recognition output for the current image to the document.
 *
 * Requests the hOCR text and then the TSV text for page imagenum() from
 * |api| and, when both are available, appends them in that order. Each
 * returned buffer is released with delete[] once it has been appended.
 *
 * @param api  Engine to query for the page's text.
 * @return false when either text is NULL (nothing is appended in that
 *         case), true otherwise.
 */
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
  char* hocr = api->GetHOCRText(imagenum());
  if (hocr == NULL) return false;

  char* hocrtsv = api->GetHOCRTSVText(imagenum());
  if (hocrtsv == NULL) {
    // The hOCR buffer is already owned here; free it before bailing out so
    // the failure path does not leak it.
    delete[] hocr;
    return false;
  }

  AppendString(hocr);
  delete[] hocr;
  AppendString(hocrtsv);
  delete[] hocrtsv;
  return true;
}