Use ocrx_cinfo to hold character box and confidence information

With hocr_char_boxes enabled, each grapheme in the hOCR output now gets
its own ocrx_cinfo span tag, which holds the character's confidence and
bounding-box coordinates. Emitting x_bboxes at the ocrx_word level was
inappropriate, because it was impossible to tell which grapheme each
bounding box represented.
This commit is contained in:
Nick White 2016-05-06 13:03:39 +01:00
parent 06b7a7b188
commit d71133a769

View File

@ -1582,37 +1582,41 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
default: // Do nothing. default: // Do nothing.
break; break;
} }
hocr_str += "'>";
if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>";
word = "";
if (hocr_boxes) {
hocr_str += "; x_bboxes";
}
do { do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) { if (grapheme && grapheme[0] != 0) {
word += HOcrEscape(grapheme);
}
delete []grapheme;
if (hocr_boxes) { if (hocr_boxes) {
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str += "<span class='ocrx_cinfo' title='x_bboxes";
hocr_str.add_str_int(" ", left); hocr_str.add_str_int(" ", left);
hocr_str.add_str_int(" ", top); hocr_str.add_str_int(" ", top);
hocr_str.add_str_int(" ", right); hocr_str.add_str_int(" ", right);
hocr_str.add_str_int(" ", bottom); hocr_str.add_str_int(" ", bottom);
hocr_str += "; x_conf";
hocr_str.add_str_int(" ", res_it->Confidence(RIL_SYMBOL));
hocr_str += "'>";
} }
hocr_str += HOcrEscape(grapheme);
if (hocr_boxes) {
hocr_str += "</span>";
}
}
delete []grapheme;
res_it->Next(RIL_SYMBOL); res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
hocr_str += "'>";
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>";
hocr_str += word;
if (italic) hocr_str += "</em>"; if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>"; if (bold) hocr_str += "</strong>";
hocr_str += "</span> "; hocr_str += "</span> ";
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
wcnt++; wcnt++;
// Close any ending block/paragraph/textline. // Close any ending block/paragraph/textline.
if (last_word_in_line) { if (last_word_in_line) {