mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-22 22:47:50 +08:00
Use ocrx_cinfo to hold character box and confidence information
With hocr_char_boxes enabled in hOCR output, each grapheme now gets its own span tag (class ocrx_cinfo), which holds the character's confidence and bounding-box coordinates. Using x_bboxes at the ocrx_word level was inappropriate, as it was impossible to tell which grapheme each bounding box represented.
This commit is contained in:
parent
06b7a7b188
commit
d71133a769
@ -1582,37 +1582,41 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
||||
default: // Do nothing.
|
||||
break;
|
||||
}
|
||||
hocr_str += "'>";
|
||||
|
||||
if (bold) hocr_str += "<strong>";
|
||||
if (italic) hocr_str += "<em>";
|
||||
|
||||
word = "";
|
||||
if (hocr_boxes) {
|
||||
hocr_str += "; x_bboxes";
|
||||
}
|
||||
do {
|
||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
word += HOcrEscape(grapheme);
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str += "<span class='ocrx_cinfo' title='x_bboxes";
|
||||
hocr_str.add_str_int(" ", left);
|
||||
hocr_str.add_str_int(" ", top);
|
||||
hocr_str.add_str_int(" ", right);
|
||||
hocr_str.add_str_int(" ", bottom);
|
||||
hocr_str += "; x_conf";
|
||||
hocr_str.add_str_int(" ", res_it->Confidence(RIL_SYMBOL));
|
||||
hocr_str += "'>";
|
||||
}
|
||||
hocr_str += HOcrEscape(grapheme);
|
||||
if (hocr_boxes) {
|
||||
hocr_str += "</span>";
|
||||
}
|
||||
}
|
||||
delete []grapheme;
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str.add_str_int(" ", left);
|
||||
hocr_str.add_str_int(" ", top);
|
||||
hocr_str.add_str_int(" ", right);
|
||||
hocr_str.add_str_int(" ", bottom);
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
|
||||
hocr_str += "'>";
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
if (bold) hocr_str += "<strong>";
|
||||
if (italic) hocr_str += "<em>";
|
||||
hocr_str += word;
|
||||
if (italic) hocr_str += "</em>";
|
||||
if (bold) hocr_str += "</strong>";
|
||||
hocr_str += "</span> ";
|
||||
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
wcnt++;
|
||||
// Close any ending block/paragraph/textline.
|
||||
if (last_word_in_line) {
|
||||
|
Loading…
Reference in New Issue
Block a user