mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 22:43:45 +08:00
Render output in TSV format.
This commit is contained in:
parent
738fe4f757
commit
b1e4a82b0b
125
api/baseapi.cpp
125
api/baseapi.cpp
@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
|
||||
*hocr_str += "\">";
|
||||
}
|
||||
|
||||
/**
 * Appends the bounding box of the element at the iterator's current
 * position to a TSV row as four tab-prefixed integer fields:
 * left, top, width, height.
 *
 * @param it        iterator whose BoundingBox() is queried.
 * @param level     page-iterator level passed through to BoundingBox().
 * @param hocr_str  output string; the four fields are appended to it.
 *
 * The return value of BoundingBox() is not checked here.
 *
 * NOTE(review): width/height are written as (right - left + 1) and
 * (bottom - top + 1), which is only correct if BoundingBox() reports an
 * inclusive right/bottom edge. The page-level row in GetHOCRTSVText emits
 * rect_width_/rect_height_ without any adjustment -- confirm the edge
 * convention and whether the "+ 1" is an off-by-one.
 */
static void AddBoxTohOCRTSV(const PageIterator *it,
|
||||
PageIteratorLevel level,
|
||||
STRING* hocr_str) {
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
// Field order: left, top, width, height -- each preceded by a tab.
hocr_str->add_str_int("\t", left);
|
||||
hocr_str->add_str_int("\t", top);
|
||||
hocr_str->add_str_int("\t", right - left + 1);
|
||||
hocr_str->add_str_int("\t", bottom - top + 1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
|
||||
hocr_str += "' title='image \"";
|
||||
if (input_file_) {
|
||||
hocr_str += HOcrEscape(input_file_->string());
|
||||
} else {
|
||||
hocr_str += "unknown";
|
||||
}
|
||||
hocr_str.add_str_int("\"; bbox ", rect_left_);
|
||||
hocr_str.add_str_int(" ", rect_top_);
|
||||
hocr_str.add_str_int(" ", rect_width_);
|
||||
hocr_str.add_str_int(" ", rect_height_);
|
||||
hocr_str.add_str_int("; ppageno ", page_number);
|
||||
hocr_str += "'>\n";
|
||||
int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
|
||||
|
||||
hocr_str.add_str_int("1\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
hocr_str.add_str_int("\t", rect_left_);
|
||||
hocr_str.add_str_int("\t", rect_top_);
|
||||
hocr_str.add_str_int("\t", rect_width_);
|
||||
hocr_str.add_str_int("\t", rect_height_);
|
||||
hocr_str += "\t-1\t\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
|
||||
// Open any new block/paragraph/textline.
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
hocr_str.add_str_int(" <div class='ocr_carea' id='block_", page_id);
|
||||
hocr_str.add_str_int("_", bcnt);
|
||||
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
|
||||
block_num++, par_num = 0, line_num = 0, word_num = 0;
|
||||
hocr_str.add_str_int("2\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
|
||||
hocr_str += "\t-1\t\n";
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
if (res_it->ParagraphIsLtr()) {
|
||||
hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_",
|
||||
page_id);
|
||||
hocr_str.add_str_int("_", pcnt);
|
||||
} else {
|
||||
hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_",
|
||||
page_id);
|
||||
hocr_str.add_str_int("_", pcnt);
|
||||
}
|
||||
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
|
||||
par_num++, line_num = 0, word_num = 0;
|
||||
hocr_str.add_str_int("3\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
|
||||
hocr_str += "\t-1\t\n";
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
|
||||
hocr_str.add_str_int("_", lcnt);
|
||||
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
|
||||
line_num++, word_num = 0;
|
||||
hocr_str.add_str_int("4\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
|
||||
hocr_str += "\t-1\t\n";
|
||||
}
|
||||
|
||||
// Now, process the word...
|
||||
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
|
||||
hocr_str.add_str_int("_", wcnt);
|
||||
int left, top, right, bottom;
|
||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||
int pointsize, font_id;
|
||||
@ -1697,34 +1715,21 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
|
||||
&monospace, &serif, &smallcaps,
|
||||
&pointsize, &font_id);
|
||||
hocr_str.add_str_int("' title='bbox ", left);
|
||||
hocr_str.add_str_int(" ", top);
|
||||
hocr_str.add_str_int(" ", right);
|
||||
hocr_str.add_str_int(" ", bottom);
|
||||
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
|
||||
if (font_info) {
|
||||
hocr_str += "; x_font ";
|
||||
hocr_str += HOcrEscape(font_name);
|
||||
hocr_str.add_str_int("; x_fsize ", pointsize);
|
||||
}
|
||||
hocr_str += "'";
|
||||
if (res_it->WordRecognitionLanguage()) {
|
||||
hocr_str += " lang='";
|
||||
hocr_str += res_it->WordRecognitionLanguage();
|
||||
hocr_str += "'";
|
||||
}
|
||||
switch (res_it->WordDirection()) {
|
||||
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
|
||||
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
|
||||
default: // Do nothing.
|
||||
break;
|
||||
}
|
||||
hocr_str += ">";
|
||||
word_num++;
|
||||
hocr_str.add_str_int("5\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
hocr_str.add_str_int("\t", left);
|
||||
hocr_str.add_str_int("\t", top);
|
||||
hocr_str.add_str_int("\t", right - left + 1);
|
||||
hocr_str.add_str_int("\t", bottom - top + 1);
|
||||
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
if (bold) hocr_str += "<strong>";
|
||||
if (italic) hocr_str += "<em>";
|
||||
hocr_str += "\t";
|
||||
do {
|
||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
delete []grapheme;
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
if (italic) hocr_str += "</em>";
|
||||
if (bold) hocr_str += "</strong>";
|
||||
hocr_str += "</span> ";
|
||||
hocr_str += "\n";
|
||||
wcnt++;
|
||||
// Close any ending block/paragraph/textline.
|
||||
if (last_word_in_line) {
|
||||
hocr_str += "\n </span>";
|
||||
lcnt++;
|
||||
}
|
||||
if (last_word_in_para) {
|
||||
hocr_str += "\n </p>\n";
|
||||
pcnt++;
|
||||
}
|
||||
if (last_word_in_block) {
|
||||
hocr_str += " </div>\n";
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
hocr_str += " </div>\n";
|
||||
|
||||
char *ret = new char[hocr_str.length() + 1];
|
||||
strcpy(ret, hocr_str.string());
|
||||
|
@ -193,43 +193,20 @@ TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
|
||||
}
|
||||
|
||||
/**
 * Document-start hook of the TSV renderer: appends header text to the
 * output via AppendString() and returns true.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers. The
 * enclosing hunk header reports 43 old lines against 20 new lines, so part
 * of the body below belongs to the removed side. Presumably the XHTML
 * prologue (the <?xml ...> through <body> strings, title() and the
 * font_info_ branch) is what was removed and only the tab-separated column
 * header line remains -- confirm against the real file.
 */
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
|
||||
AppendString(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
|
||||
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
|
||||
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
|
||||
"lang=\"en\">\n <head>\n <title>\n");
|
||||
AppendString(title());
|
||||
AppendString(
|
||||
"</title>\n"
|
||||
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
|
||||
"charset=utf-8\" />\n"
|
||||
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
|
||||
"' />\n"
|
||||
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
|
||||
" ocr_line ocrx_word");
|
||||
if (font_info_)
|
||||
AppendString(
|
||||
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
|
||||
AppendString(
|
||||
"'/>\n"
|
||||
"</head>\n<body>\n");
|
||||
|
||||
// TSV column header: twelve tab-separated column names, newline-terminated.
AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Document-end hook of the TSV renderer; always returns true.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers.
 * The closing </body></html> string below is presumably the removed side
 * of the change (a TSV document needs no HTML trailer) -- confirm against
 * the real file.
 */
bool TessHOcrTsvRenderer::EndDocumentHandler() {
|
||||
AppendString(" </body>\n</html>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Per-image hook of the TSV renderer: asks the API for the text of the
 * current image (imagenum()), appends it to the output and frees the
 * buffer with delete[].
 *
 * @param api  recognizer instance used to produce the page text.
 * @return false if the API returned NULL for the page text, true otherwise.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers, so
 * both the GetHOCRText()/hocr lines and the GetHOCRTSVText()/hocrtsv lines
 * appear. Presumably only the hocrtsv variant survives the change. Read
 * literally, the early return on hocrtsv == NULL would skip delete[] hocr
 * -- confirm against the real file that only one buffer is involved.
 */
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
|
||||
char* hocr = api->GetHOCRText(imagenum());
|
||||
if (hocr == NULL) return false;
|
||||
char* hocrtsv = api->GetHOCRTSVText(imagenum());
|
||||
if (hocrtsv == NULL) return false;
|
||||
|
||||
AppendString(hocr);
|
||||
delete[] hocr;
|
||||
AppendString(hocrtsv);
|
||||
delete[] hocrtsv;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user