mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-23 06:57:50 +08:00
Use ocrx_cinfo to hold character box and confidence information
With hocr_char_boxes enabled in hOCR output, each grapheme now gets its own span tag (class ocrx_cinfo), which holds that character's confidence and bounding-box coordinates. Using x_bboxes at the ocrx_word level was inappropriate, as it was impossible to tell which grapheme was represented by each bounding box.
This commit is contained in:
parent
06b7a7b188
commit
d71133a769
@ -1582,37 +1582,41 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
|||||||
default: // Do nothing.
|
default: // Do nothing.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
hocr_str += "'>";
|
||||||
|
|
||||||
|
if (bold) hocr_str += "<strong>";
|
||||||
|
if (italic) hocr_str += "<em>";
|
||||||
|
|
||||||
word = "";
|
|
||||||
if (hocr_boxes) {
|
|
||||||
hocr_str += "; x_bboxes";
|
|
||||||
}
|
|
||||||
do {
|
do {
|
||||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||||
if (grapheme && grapheme[0] != 0) {
|
if (grapheme && grapheme[0] != 0) {
|
||||||
word += HOcrEscape(grapheme);
|
if (hocr_boxes) {
|
||||||
|
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||||
|
hocr_str += "<span class='ocrx_cinfo' title='x_bboxes";
|
||||||
|
hocr_str.add_str_int(" ", left);
|
||||||
|
hocr_str.add_str_int(" ", top);
|
||||||
|
hocr_str.add_str_int(" ", right);
|
||||||
|
hocr_str.add_str_int(" ", bottom);
|
||||||
|
hocr_str += "; x_conf";
|
||||||
|
hocr_str.add_str_int(" ", res_it->Confidence(RIL_SYMBOL));
|
||||||
|
hocr_str += "'>";
|
||||||
|
}
|
||||||
|
hocr_str += HOcrEscape(grapheme);
|
||||||
|
if (hocr_boxes) {
|
||||||
|
hocr_str += "</span>";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
delete []grapheme;
|
delete []grapheme;
|
||||||
if (hocr_boxes) {
|
|
||||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
|
||||||
hocr_str.add_str_int(" ", left);
|
|
||||||
hocr_str.add_str_int(" ", top);
|
|
||||||
hocr_str.add_str_int(" ", right);
|
|
||||||
hocr_str.add_str_int(" ", bottom);
|
|
||||||
}
|
|
||||||
res_it->Next(RIL_SYMBOL);
|
res_it->Next(RIL_SYMBOL);
|
||||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||||
|
|
||||||
hocr_str += "'>";
|
|
||||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
|
||||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
|
||||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
|
||||||
if (bold) hocr_str += "<strong>";
|
|
||||||
if (italic) hocr_str += "<em>";
|
|
||||||
hocr_str += word;
|
|
||||||
if (italic) hocr_str += "</em>";
|
if (italic) hocr_str += "</em>";
|
||||||
if (bold) hocr_str += "</strong>";
|
if (bold) hocr_str += "</strong>";
|
||||||
hocr_str += "</span> ";
|
hocr_str += "</span> ";
|
||||||
|
|
||||||
|
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||||
|
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||||
|
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||||
wcnt++;
|
wcnt++;
|
||||||
// Close any ending block/paragraph/textline.
|
// Close any ending block/paragraph/textline.
|
||||||
if (last_word_in_line) {
|
if (last_word_in_line) {
|
||||||
|
Loading…
Reference in New Issue
Block a user