mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-22 22:47:50 +08:00
Add option to include character bounding boxes in hocr output
Add the 'hocr_char_boxes' configuration option (off by default). When it is enabled, the bounding box of each character in a word is written to the x_bboxes property of that word's ocrx_word element in the hocr output.
This commit is contained in:
parent
1af0639f50
commit
06b7a7b188
@ -1467,9 +1467,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
||||
bool para_is_ltr = true; // Default direction is LTR
|
||||
const char* paragraph_lang = NULL;
|
||||
bool font_info = false;
|
||||
bool hocr_boxes = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
|
||||
|
||||
STRING hocr_str("");
|
||||
STRING word("");
|
||||
|
||||
if (input_file_ == NULL)
|
||||
SetInputName(NULL);
|
||||
@ -1564,7 +1567,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
||||
}
|
||||
hocr_str.add_str_int("; x_fsize ", pointsize);
|
||||
}
|
||||
hocr_str += "'";
|
||||
const char* lang = res_it->WordRecognitionLanguage();
|
||||
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
|
||||
hocr_str += " lang='";
|
||||
@ -1580,20 +1582,34 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
||||
default: // Do nothing.
|
||||
break;
|
||||
}
|
||||
hocr_str += ">";
|
||||
|
||||
word = "";
|
||||
if (hocr_boxes) {
|
||||
hocr_str += "; x_bboxes";
|
||||
}
|
||||
do {
|
||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
word += HOcrEscape(grapheme);
|
||||
}
|
||||
delete []grapheme;
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str.add_str_int(" ", left);
|
||||
hocr_str.add_str_int(" ", top);
|
||||
hocr_str.add_str_int(" ", right);
|
||||
hocr_str.add_str_int(" ", bottom);
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
|
||||
hocr_str += "'>";
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
if (bold) hocr_str += "<strong>";
|
||||
if (italic) hocr_str += "<em>";
|
||||
do {
|
||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
hocr_str += HOcrEscape(grapheme);
|
||||
}
|
||||
delete []grapheme;
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
hocr_str += word;
|
||||
if (italic) hocr_str += "</em>";
|
||||
if (bold) hocr_str += "</strong>";
|
||||
hocr_str += "</span> ";
|
||||
|
@ -273,6 +273,8 @@ Tesseract::Tesseract()
|
||||
"Mark v.bad words for tilde crunch", this->params()),
|
||||
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
|
||||
this->params()),
|
||||
BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
|
||||
this->params()),
|
||||
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
|
||||
this->params()),
|
||||
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
|
||||
|
@ -933,6 +933,8 @@ class Tesseract : public Wordrec {
|
||||
"Mark v.bad words for tilde crunch");
|
||||
BOOL_VAR_H(hocr_font_info, false,
|
||||
"Add font info to hocr output");
|
||||
BOOL_VAR_H(hocr_char_boxes, false,
|
||||
"Add coordinates for each character to hocr output");
|
||||
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
|
||||
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
|
||||
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
|
||||
|
Loading…
Reference in New Issue
Block a user