Use ocrx_cinfo to hold character box and confidence information

With hocr_char_boxes enabled in hOCR output, each grapheme now gets its own span tag, which holds the character confidence and box coordinates. Using x_bboxes at the ocrx_word level was inappropriate, as it was impossible to tell which grapheme each bounding box represented.
2024-12-23 06:57:50 +08:00 · 2016-05-06 13:03:39 +01:00 · 2016-05-06 13:03:39 +01:00 · d71133a769
commit d71133a769
parent 06b7a7b188
1 changed files with 23 additions and 19 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@@ -1582,37 +1582,41 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
      default:  // Do nothing.
        break;
    }
    hocr_str += "'>";
    if (bold) hocr_str += "<strong>";
    if (italic) hocr_str += "<em>";
    word = "";
    if (hocr_boxes) {
      hocr_str += "; x_bboxes";
    }
    do {
      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
      if (grapheme && grapheme[0] != 0) {
-        word += HOcrEscape(grapheme);
+        if (hocr_boxes) {
          res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
          hocr_str += "<span class='ocrx_cinfo' title='x_bboxes";
          hocr_str.add_str_int(" ", left);
          hocr_str.add_str_int(" ", top);
          hocr_str.add_str_int(" ", right);
          hocr_str.add_str_int(" ", bottom);
          hocr_str += "; x_conf";
          hocr_str.add_str_int(" ", res_it->Confidence(RIL_SYMBOL));
          hocr_str += "'>";
        }
        hocr_str += HOcrEscape(grapheme);
        if (hocr_boxes) {
          hocr_str += "</span>";
        }
      }
      delete []grapheme;
      if (hocr_boxes) {
        res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
        hocr_str.add_str_int(" ", left);
        hocr_str.add_str_int(" ", top);
        hocr_str.add_str_int(" ", right);
        hocr_str.add_str_int(" ", bottom);
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
    hocr_str += "'>";
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    if (bold) hocr_str += "<strong>";
    if (italic) hocr_str += "<em>";
    hocr_str += word;
    if (italic) hocr_str += "</em>";
    if (bold) hocr_str += "</strong>";
    hocr_str += "</span> ";
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    wcnt++;
    // Close any ending block/paragraph/textline.
    if (last_word_in_line) {