Add option to include character bounding boxes in hOCR output

Add the 'hocr_char_boxes' configuration option (off by default),
which enables printing the bounding box of each character in the
x_bboxes property of an ocrx_word element in hOCR output.
This commit is contained in:
Nick White 2016-04-27 12:13:45 +01:00
parent 1af0639f50
commit 06b7a7b188
3 changed files with 30 additions and 10 deletions

View File

@ -1467,9 +1467,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
bool para_is_ltr = true; // Default direction is LTR bool para_is_ltr = true; // Default direction is LTR
const char* paragraph_lang = NULL; const char* paragraph_lang = NULL;
bool font_info = false; bool font_info = false;
bool hocr_boxes = false;
GetBoolVariable("hocr_font_info", &font_info); GetBoolVariable("hocr_font_info", &font_info);
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
STRING hocr_str(""); STRING hocr_str("");
STRING word("");
if (input_file_ == NULL) if (input_file_ == NULL)
SetInputName(NULL); SetInputName(NULL);
@ -1564,7 +1567,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
} }
hocr_str.add_str_int("; x_fsize ", pointsize); hocr_str.add_str_int("; x_fsize ", pointsize);
} }
hocr_str += "'";
const char* lang = res_it->WordRecognitionLanguage(); const char* lang = res_it->WordRecognitionLanguage();
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
hocr_str += " lang='"; hocr_str += " lang='";
@ -1580,20 +1582,34 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
default: // Do nothing. default: // Do nothing.
break; break;
} }
hocr_str += ">";
word = "";
if (hocr_boxes) {
hocr_str += "; x_bboxes";
}
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
word += HOcrEscape(grapheme);
}
delete []grapheme;
if (hocr_boxes) {
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str.add_str_int(" ", left);
hocr_str.add_str_int(" ", top);
hocr_str.add_str_int(" ", right);
hocr_str.add_str_int(" ", bottom);
}
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
hocr_str += "'>";
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
if (bold) hocr_str += "<strong>"; if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>"; if (italic) hocr_str += "<em>";
do { hocr_str += word;
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
hocr_str += HOcrEscape(grapheme);
}
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) hocr_str += "</em>"; if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>"; if (bold) hocr_str += "</strong>";
hocr_str += "</span> "; hocr_str += "</span> ";

View File

@ -273,6 +273,8 @@ Tesseract::Tesseract()
"Mark v.bad words for tilde crunch", this->params()), "Mark v.bad words for tilde crunch", this->params()),
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
this->params()), this->params()),
BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
this->params()),
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
this->params()), this->params()),
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,

View File

@ -933,6 +933,8 @@ class Tesseract : public Wordrec {
"Mark v.bad words for tilde crunch"); "Mark v.bad words for tilde crunch");
BOOL_VAR_H(hocr_font_info, false, BOOL_VAR_H(hocr_font_info, false,
"Add font info to hocr output"); "Add font info to hocr output");
BOOL_VAR_H(hocr_char_boxes, false,
"Add coordinates for each character to hocr output");
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?"); BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?"); BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this"); double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");