Add option to include character bounding boxes in hocr output

Add the 'hocr_char_boxes' configuration option (off by default). When enabled, the bounding box of each character in a word is printed in the x_bboxes property of that word's ocrx_word element in the hOCR output.
2024-12-22 22:47:50 +08:00 · 2016-04-27 12:13:45 +01:00 · 2016-04-27 12:13:45 +01:00 · 06b7a7b188
commit 06b7a7b188
parent 1af0639f50
3 changed files with 30 additions and 10 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@ -1467,9 +1467,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
  bool para_is_ltr = true; // Default direction is LTR
  const char* paragraph_lang = NULL;
  bool font_info = false;
+  bool hocr_boxes = false;
  GetBoolVariable("hocr_font_info", &font_info);
+  GetBoolVariable("hocr_char_boxes", &hocr_boxes);

  STRING hocr_str("");
+  STRING word("");

  if (input_file_ == NULL)
      SetInputName(NULL);
@ -1564,7 +1567,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
      }
      hocr_str.add_str_int("; x_fsize ", pointsize);
    }
-    hocr_str += "'";
    const char* lang = res_it->WordRecognitionLanguage();
    if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
      hocr_str += " lang='";
@ -1580,20 +1582,34 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
      default:  // Do nothing.
        break;
    }
-    hocr_str += ">";
+
+    word = "";
+    if (hocr_boxes) {
+      hocr_str += "; x_bboxes";
+    }
+    do {
+      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
+      if (grapheme && grapheme[0] != 0) {
+        word += HOcrEscape(grapheme);
+      }
+      delete []grapheme;
+      if (hocr_boxes) {
+        res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
+        hocr_str.add_str_int(" ", left);
+        hocr_str.add_str_int(" ", top);
+        hocr_str.add_str_int(" ", right);
+        hocr_str.add_str_int(" ", bottom);
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+
+    hocr_str += "'>";
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    if (bold) hocr_str += "<strong>";
    if (italic) hocr_str += "<em>";
-    do {
-      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
-      if (grapheme && grapheme[0] != 0) {
-        hocr_str += HOcrEscape(grapheme);
-      }
-      delete []grapheme;
-      res_it->Next(RIL_SYMBOL);
-    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+    hocr_str += word;
    if (italic) hocr_str += "</em>";
    if (bold) hocr_str += "</strong>";
    hocr_str += "</span> ";
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@ -273,6 +273,8 @@ Tesseract::Tesseract()
                  "Mark v.bad words for tilde crunch", this->params()),
      BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
                  this->params()),
+      BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
+                  this->params()),
      BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
                  this->params()),
      BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@ -933,6 +933,8 @@ class Tesseract : public Wordrec {
             "Mark v.bad words for tilde crunch");
  BOOL_VAR_H(hocr_font_info, false,
             "Add font info to hocr output");
+  BOOL_VAR_H(hocr_char_boxes, false,
+             "Add coordinates for each character to hocr output");
  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");