Use ocrx_cinfo to hold character box and confidence information

When hocr_char_boxes is enabled, each grapheme in the hOCR output now gets its own span tag, which holds that character's confidence and bounding-box coordinates. Emitting x_bboxes at the ocrx_word level was inappropriate, because it was impossible to tell which grapheme each bounding box represented.
2024-12-22 22:47:50 +08:00 · 2016-05-06 13:03:39 +01:00 · 2016-05-06 13:03:39 +01:00 · d71133a769
commit d71133a769
parent 06b7a7b188
1 changed file with 23 additions and 19 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@@ -1582,37 +1582,41 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
      default:  // Do nothing.
        break;
    }
+    hocr_str += "'>";
+
+    if (bold) hocr_str += "<strong>";
+    if (italic) hocr_str += "<em>";

-    word = "";
-    if (hocr_boxes) {
-      hocr_str += "; x_bboxes";
-    }
    do {
      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
      if (grapheme && grapheme[0] != 0) {
-        word += HOcrEscape(grapheme);
+        if (hocr_boxes) {
+          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
+          hocr_str += "<span class='ocrx_cinfo' title='x_bboxes";
+          hocr_str.add_str_int(" ", left);
+          hocr_str.add_str_int(" ", top);
+          hocr_str.add_str_int(" ", right);
+          hocr_str.add_str_int(" ", bottom);
+          hocr_str += "; x_conf";
+          hocr_str.add_str_int(" ", res_it->Confidence(RIL_SYMBOL));
+          hocr_str += "'>";
+        }
+        hocr_str += HOcrEscape(grapheme);
+        if (hocr_boxes) {
+          hocr_str += "</span>";
+        }
      }
      delete []grapheme;
-      if (hocr_boxes) {
-        res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
-        hocr_str.add_str_int(" ", left);
-        hocr_str.add_str_int(" ", top);
-        hocr_str.add_str_int(" ", right);
-        hocr_str.add_str_int(" ", bottom);
-      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));

-    hocr_str += "'>";
-    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
-    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
-    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
-    if (bold) hocr_str += "<strong>";
-    if (italic) hocr_str += "<em>";
-    hocr_str += word;
    if (italic) hocr_str += "</em>";
    if (bold) hocr_str += "</strong>";
    hocr_str += "</span> ";
+
+    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
+    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
+    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    wcnt++;
    // Close any ending block/paragraph/textline.
    if (last_word_in_line) {