Add option to include character bounding boxes in hocr output

Add the 'hocr_char_boxes' configuration option (off by default). When enabled, the bounding box of each character is written to the x_bboxes property of the enclosing ocrx_word element in hOCR output.
2024-12-23 06:57:50 +08:00 · 2016-04-27 12:13:45 +01:00 · 2016-04-27 12:13:45 +01:00 · 06b7a7b188
commit 06b7a7b188
parent 1af0639f50
3 changed files with 30 additions and 10 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@ -1467,9 +1467,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
  bool para_is_ltr = true; // Default direction is LTR
  const char* paragraph_lang = NULL;
  bool font_info = false;
  bool hocr_boxes = false;
  GetBoolVariable("hocr_font_info", &font_info);
  GetBoolVariable("hocr_char_boxes", &hocr_boxes);
  STRING hocr_str("");
  STRING word("");
  if (input_file_ == NULL)
      SetInputName(NULL);
@ -1564,7 +1567,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
      }
      hocr_str.add_str_int("; x_fsize ", pointsize);
    }
    hocr_str += "'";
    const char* lang = res_it->WordRecognitionLanguage();
    if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
      hocr_str += " lang='";
@ -1580,20 +1582,34 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
      default:  // Do nothing.
        break;
    }
-    hocr_str += ">";
+
    word = "";
    if (hocr_boxes) {
      hocr_str += "; x_bboxes";
    }
    do {
      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
      if (grapheme && grapheme[0] != 0) {
        word += HOcrEscape(grapheme);
      }
      delete []grapheme;
      if (hocr_boxes) {
        res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
        hocr_str.add_str_int(" ", left);
        hocr_str.add_str_int(" ", top);
        hocr_str.add_str_int(" ", right);
        hocr_str.add_str_int(" ", bottom);
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
    hocr_str += "'>";
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
    bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
    if (bold) hocr_str += "<strong>";
    if (italic) hocr_str += "<em>";
-    do {
+    hocr_str += word;
      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
      if (grapheme && grapheme[0] != 0) {
        hocr_str += HOcrEscape(grapheme);
      }
      delete []grapheme;
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
    if (italic) hocr_str += "</em>";
    if (bold) hocr_str += "</strong>";
    hocr_str += "</span> ";
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@ -273,6 +273,8 @@ Tesseract::Tesseract()
                  "Mark v.bad words for tilde crunch", this->params()),
      BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
                  this->params()),
      BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
                  this->params()),
      BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
                  this->params()),
      BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@ -933,6 +933,8 @@ class Tesseract : public Wordrec {
             "Mark v.bad words for tilde crunch");
  BOOL_VAR_H(hocr_font_info, false,
             "Add font info to hocr output");
  BOOL_VAR_H(hocr_char_boxes, false,
             "Add coordinates for each character to hocr output");
  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");