mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-23 06:57:50 +08:00
Add option to include character bounding boxes in hOCR output
Add the 'hocr_char_boxes' configuration option (off by default). When enabled, the bounding box of each character is printed in the x_bboxes property of the enclosing ocrx_word element in the hOCR output.
This commit is contained in:
parent
1af0639f50
commit
06b7a7b188
@@ -1467,9 +1467,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
|||||||
bool para_is_ltr = true; // Default direction is LTR
|
bool para_is_ltr = true; // Default direction is LTR
|
||||||
const char* paragraph_lang = NULL;
|
const char* paragraph_lang = NULL;
|
||||||
bool font_info = false;
|
bool font_info = false;
|
||||||
|
bool hocr_boxes = false;
|
||||||
GetBoolVariable("hocr_font_info", &font_info);
|
GetBoolVariable("hocr_font_info", &font_info);
|
||||||
|
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
|
||||||
|
|
||||||
STRING hocr_str("");
|
STRING hocr_str("");
|
||||||
|
STRING word("");
|
||||||
|
|
||||||
if (input_file_ == NULL)
|
if (input_file_ == NULL)
|
||||||
SetInputName(NULL);
|
SetInputName(NULL);
|
||||||
@@ -1564,7 +1567,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
|||||||
}
|
}
|
||||||
hocr_str.add_str_int("; x_fsize ", pointsize);
|
hocr_str.add_str_int("; x_fsize ", pointsize);
|
||||||
}
|
}
|
||||||
hocr_str += "'";
|
|
||||||
const char* lang = res_it->WordRecognitionLanguage();
|
const char* lang = res_it->WordRecognitionLanguage();
|
||||||
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
|
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
|
||||||
hocr_str += " lang='";
|
hocr_str += " lang='";
|
||||||
@@ -1580,20 +1582,34 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
|||||||
default: // Do nothing.
|
default: // Do nothing.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
hocr_str += ">";
|
|
||||||
|
word = "";
|
||||||
|
if (hocr_boxes) {
|
||||||
|
hocr_str += "; x_bboxes";
|
||||||
|
}
|
||||||
|
do {
|
||||||
|
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||||
|
if (grapheme && grapheme[0] != 0) {
|
||||||
|
word += HOcrEscape(grapheme);
|
||||||
|
}
|
||||||
|
delete []grapheme;
|
||||||
|
if (hocr_boxes) {
|
||||||
|
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||||
|
hocr_str.add_str_int(" ", left);
|
||||||
|
hocr_str.add_str_int(" ", top);
|
||||||
|
hocr_str.add_str_int(" ", right);
|
||||||
|
hocr_str.add_str_int(" ", bottom);
|
||||||
|
}
|
||||||
|
res_it->Next(RIL_SYMBOL);
|
||||||
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||||
|
|
||||||
|
hocr_str += "'>";
|
||||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||||
if (bold) hocr_str += "<strong>";
|
if (bold) hocr_str += "<strong>";
|
||||||
if (italic) hocr_str += "<em>";
|
if (italic) hocr_str += "<em>";
|
||||||
do {
|
hocr_str += word;
|
||||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
|
||||||
if (grapheme && grapheme[0] != 0) {
|
|
||||||
hocr_str += HOcrEscape(grapheme);
|
|
||||||
}
|
|
||||||
delete []grapheme;
|
|
||||||
res_it->Next(RIL_SYMBOL);
|
|
||||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
|
||||||
if (italic) hocr_str += "</em>";
|
if (italic) hocr_str += "</em>";
|
||||||
if (bold) hocr_str += "</strong>";
|
if (bold) hocr_str += "</strong>";
|
||||||
hocr_str += "</span> ";
|
hocr_str += "</span> ";
|
||||||
|
@@ -273,6 +273,8 @@ Tesseract::Tesseract()
|
|||||||
"Mark v.bad words for tilde crunch", this->params()),
|
"Mark v.bad words for tilde crunch", this->params()),
|
||||||
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
|
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
|
||||||
this->params()),
|
this->params()),
|
||||||
|
BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
|
||||||
|
this->params()),
|
||||||
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
|
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
|
||||||
this->params()),
|
this->params()),
|
||||||
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
|
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
|
||||||
|
@@ -933,6 +933,8 @@ class Tesseract : public Wordrec {
|
|||||||
"Mark v.bad words for tilde crunch");
|
"Mark v.bad words for tilde crunch");
|
||||||
BOOL_VAR_H(hocr_font_info, false,
|
BOOL_VAR_H(hocr_font_info, false,
|
||||||
"Add font info to hocr output");
|
"Add font info to hocr output");
|
||||||
|
BOOL_VAR_H(hocr_char_boxes, false,
|
||||||
|
"Add coordinates for each character to hocr output");
|
||||||
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
|
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
|
||||||
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
|
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
|
||||||
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
|
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
|
||||||
|
Loading…
Reference in New Issue
Block a user