From d71133a769cb6505e6edfd2508e0b7abb9275de7 Mon Sep 17 00:00:00 2001 From: Nick White Date: Fri, 6 May 2016 13:03:39 +0100 Subject: [PATCH] Use ocrx_cinfo to hold character box and confidence information With hocr_char_boxes enabled in hocr output, each grapheme now gets its own span tag, which holds the character confidence and box coordinates. Using x_bboxes at the ocrx_word level was inappropriate, as it was impossible to find which grapheme was represented by each bounding box. --- api/baseapi.cpp | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 88bdf13cf..0c37bfcfe 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1582,37 +1582,41 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { default: // Do nothing. break; } + hocr_str += "'>"; + + if (bold) hocr_str += ""; + if (italic) hocr_str += ""; - word = ""; - if (hocr_boxes) { - hocr_str += "; x_bboxes"; - } do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != 0) { - word += HOcrEscape(grapheme); + if (hocr_boxes) { + res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); + hocr_str += ""; + } + hocr_str += HOcrEscape(grapheme); + if (hocr_boxes) { + hocr_str += ""; + } } delete []grapheme; - if (hocr_boxes) { - res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); - hocr_str.add_str_int(" ", left); - hocr_str.add_str_int(" ", top); - hocr_str.add_str_int(" ", right); - hocr_str.add_str_int(" ", bottom); - } res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - hocr_str += "'>"; - bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); - bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); - bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - if (bold) hocr_str += ""; - if (italic) hocr_str += ""; - hocr_str += word; if (italic) hocr_str += ""; if (bold) hocr_str += ""; hocr_str += " "; + + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); wcnt++; // Close any ending block/paragraph/textline. if (last_word_in_line) {