INCOMPATIBLE fix to hOCR line height information - fixes #225.

This fixes the duplicate line IDs caused by inserting height information into the middle of the ID and it moves the line height info into the title attribute like everything else, rather than using non-standard HTML attributes (which won't validate). This change may break consumers of the HTML output, but 3.04 has only been in the wild for 6 months and the current HTML is invalid, so I believe the benefit outweighs the cost for the fix.
2024-11-25 03:29:05 +08:00 · 2016-02-15 18:02:46 -05:00 · 2016-02-15 18:02:46 -05:00 · 431786276c
commit 431786276c
parent 3422059e09
1 changed files with 42 additions and 25 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@ -1378,18 +1378,42 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it,
  hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
 }

-static void AddBoxTohOCR(const PageIterator *it,
+static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) {
+  unsigned long bufsize = base.length() + 2 * kMaxIntSize;
+  char id_buffer[bufsize];
+  if (num2 >= 0) {
+    snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2);
+  } else {
+    snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1);
+  }
+  id_buffer[bufsize - 1] = '\0';
+  *hocr_str += " id='";
+  *hocr_str += id_buffer;
+  *hocr_str += "'";
+}
+
+static void AddBoxTohOCR(const ResultIterator *it,
                         PageIteratorLevel level,
                         STRING* hocr_str) {
  int left, top, right, bottom;
  it->BoundingBox(level, &left, &top, &right, &bottom);
-  hocr_str->add_str_int("' title=\"bbox ", left);
+  // This is the only place we use double quotes instead of single quotes,
+  // but it may too late to change for consistency
+  hocr_str->add_str_int(" title=\"bbox ", left);
  hocr_str->add_str_int(" ", top);
  hocr_str->add_str_int(" ", right);
  hocr_str->add_str_int(" ", bottom);
-  // Add baseline coordinates for textlines only.
-  if (level == RIL_TEXTLINE)
+  // Add baseline coordinates & heights for textlines only.
+  if (level == RIL_TEXTLINE) {
    AddBaselineCoordsTohOCR(it, level, hocr_str);
+    // add custom height measures
+    float row_height, descenders, ascenders;  // row attributes
+    it->RowAttributes(&row_height, &descenders, &ascenders);
+    // TODO: Do we want to limit these to a single decimal place?
+    hocr_str->add_str_double("; x_size ", row_height);
+    hocr_str->add_str_double("; x_descenders ", descenders * -1);
+    hocr_str->add_str_double("; x_ascenders ", ascenders);
+  }
  *hocr_str += "\">";
 }

@ -1420,7 +1444,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {

  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
  int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
-  float row_height, descenders, ascenders;  // row attributes
  bool font_info = false;
  GetBoolVariable("hocr_font_info", &font_info);

@ -1446,8 +1469,9 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
  delete[] utf8_str;
 #endif

-  hocr_str.add_str_int("  <div class='ocr_page' id='page_", page_id);
-  hocr_str += "' title='image \"";
+  hocr_str += "  <div class='ocr_page'";
+  AddIdTohOCR(&hocr_str, "page", page_id, -1);
+  hocr_str += " title='image \"";
  if (input_file_) {
    hocr_str += HOcrEscape(input_file_->string());
  } else {
@ -1469,36 +1493,29 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {

    // Open any new block/paragraph/textline.
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
-      hocr_str.add_str_int("   <div class='ocr_carea' id='block_", page_id);
-      hocr_str.add_str_int("_", bcnt);
+      hocr_str += "   <div class='ocr_carea'";
+      AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
      AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
    }
    if (res_it->IsAtBeginningOf(RIL_PARA)) {
+      hocr_str += "\n    <p class='ocr_par'";
      if (res_it->ParagraphIsLtr()) {
-        hocr_str.add_str_int("\n    <p class='ocr_par' dir='ltr' id='par_",
-                             page_id);
-        hocr_str.add_str_int("_", pcnt);
+        hocr_str += " dir='ltr'";
      } else {
-        hocr_str.add_str_int("\n    <p class='ocr_par' dir='rtl' id='par_",
-                             page_id);
-        hocr_str.add_str_int("_", pcnt);
+        hocr_str += " dir='rtl'";
      }
+      AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
      AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
    }
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
-      int fontsize;
-      hocr_str.add_str_int("\n     <span class='ocr_line' id='line_", page_id);
-      res_it->RowAttributes(&row_height, &descenders, &ascenders);
-      hocr_str.add_str_int("' size='", row_height);
-      hocr_str.add_str_int("' descenders='", descenders * -1);
-      hocr_str.add_str_int("' ascenders='", ascenders);
-      hocr_str.add_str_int("_", lcnt);
+      hocr_str += "\n     <span class='ocr_line'";
+      AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
      AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
    }

    // Now, process the word...
-    hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
-    hocr_str.add_str_int("_", wcnt);
+    hocr_str += "<span class='ocrx_word'";
+    AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
    int left, top, right, bottom;
    bool bold, italic, underlined, monospace, serif, smallcaps;
    int pointsize, font_id;
@ -1507,7 +1524,7 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
    font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
                                           &monospace, &serif, &smallcaps,
                                           &pointsize, &font_id);
-    hocr_str.add_str_int("' title='bbox ", left);
+    hocr_str.add_str_int(" title='bbox ", left);
    hocr_str.add_str_int(" ", top);
    hocr_str.add_str_int(" ", right);
    hocr_str.add_str_int(" ", bottom);