Fixed issue 263 with modified patch.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@333 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-08-06 13:56:47 +08:00 · 2010-05-19 18:35:40 +00:00 · 2010-05-19 18:35:40 +00:00 · e3e78b076b
commit e3e78b076b
parent 533a671cff
4 changed files with 244 additions and 31 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@ -703,6 +703,144 @@ char* TessBaseAPI::GetUTF8Text() {
  return result;
 }

+// Helper returns true if there is a paragraph break between bbox_cur
+// and bbox_prev, the bounding boxes of the current and the previous line.
+// right is the right edge against which both lines are measured (the caller
+// passes the right edge of the enclosing block) and line_height is the
+// tolerance used by every comparison below.
+// TODO(rays) improve and incorporate deeper into tesseract, so other
+// output methods get the benefit.
+static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
+                             int right, int line_height) {
+  // Check if the distance between lines is larger than the normal leading.
+  if (fabs(bbox_cur.bottom() - bbox_prev.bottom()) > line_height * 2)
+    return true;
+
+  // Check if the distance between left bounds of the two lines is nearly the
+  // same as between their right bounds (if so, then both lines probably belong
+  // to the same paragraph, maybe a centered one).
+  // The comparison has to be applied to the result of fabs(); nesting it
+  // inside the argument would take the absolute value of a boolean and let
+  // every large negative difference pass as "same paragraph".
+  if (fabs((bbox_cur.left() - bbox_prev.left()) -
+           (bbox_prev.right() - bbox_cur.right())) < line_height)
+    return false;
+
+  // Check if there is a paragraph indent at this line (either -ve or +ve).
+  if (fabs(bbox_cur.left() - bbox_prev.left()) > line_height)
+    return true;
+
+  // Check if both current and previous line don't reach the right bound of the
+  // block, but the distance is different. This will cause all lines in a verse
+  // to be treated as separate paragraphs, but most probably will not split
+  // block-quotes to separate lines (at least if the text is justified).
+  if (fabs(bbox_cur.right() - bbox_prev.right()) > line_height &&
+      right - bbox_cur.right() > line_height &&
+      right - bbox_prev.right() > line_height)
+    return true;
+
+  return false;
+}
+
+// Helper appends the remainder of an hOCR opening tag to hocr_str: it ends
+// the id attribute, writes a title attribute holding the bbox of the given
+// box and closes the tag. The vertical coordinates are written as
+// image_height minus the box's top and bottom.
+static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
+  const int x0 = box.left();
+  const int y0 = image_height - box.top();
+  const int x1 = box.right();
+  const int y1 = image_height - box.bottom();
+
+  hocr_str->add_str_int("' title=\"bbox ", x0);
+  hocr_str->add_str_int(" ", y0);
+  hocr_str->add_str_int(" ", x1);
+  hocr_str->add_str_int(" ", y1);
+  *hocr_str += "\">";
+}
+
+// Helper appends text to hocr_str with the characters that are special in
+// HTML markup (&, < and >) replaced by entities, so that recognized text
+// cannot break the surrounding tags. All other bytes, including the bytes of
+// multi-byte UTF-8 sequences, are copied unchanged.
+static void AddEscapedTextTohOCR(const char* text, STRING* hocr_str) {
+  for (; *text != '\0'; ++text) {
+    if (*text == '&') {
+      *hocr_str += "&amp;";
+    } else if (*text == '<') {
+      *hocr_str += "&lt;";
+    } else if (*text == '>') {
+      *hocr_str += "&gt;";
+    } else {
+      const char single[2] = {*text, '\0'};
+      *hocr_str += single;
+    }
+  }
+}
+
+// Make a HTML-formatted string with hOCR markup from the internal
+// data structures.
+// page_id is written into the id of the page element and of every block,
+// line and word element. Returns NULL if there is no tesseract_ instance or
+// if Recognize() has to be run and returns a negative value; otherwise the
+// result is allocated with new [] and must be freed with delete [].
+// STL removed from original patch submission and refactored by rays.
+char* TessBaseAPI::GetHOCRText(int page_id) {
+  if (tesseract_ == NULL ||
+      (page_res_ == NULL && Recognize(NULL) < 0))
+    return NULL;
+
+  PAGE_RES_IT page_res_it(page_res_);
+  ROW_RES *row = NULL;           // current row
+  ROW *real_row = NULL, *prev_row = NULL;
+  BLOCK_RES *block = NULL;       // current block
+  BLOCK *real_block = NULL;
+  int lcnt = 1, bcnt = 1, wcnt = 1;  // next line, block and word numbers
+
+  STRING hocr_str;
+
+  hocr_str.add_str_int("<div class='ocr_page' id='page_", page_id);
+  hocr_str += "' title='image \"";
+  // Guard the dereference: the image name is left empty if none is stored.
+  if (input_file_ != NULL)
+    hocr_str += *input_file_;
+  hocr_str.add_str_int("\"; bbox ", rect_left_);
+  hocr_str.add_str_int(" ", rect_top_);
+  hocr_str.add_str_int(" ", rect_width_);
+  hocr_str.add_str_int(" ", rect_height_);
+  hocr_str += "'>\n";
+
+  for (page_res_it.restart_page(); page_res_it.word() != NULL;
+       page_res_it.forward()) {
+    if (block != page_res_it.block()) {
+      // Entering a new block: close the line, paragraph and block that were
+      // open, if any, before opening the new block and its first paragraph.
+      if (block != NULL) {
+        hocr_str += "</span>\n</p>\n</div>\n";
+      }
+
+      block = page_res_it.block();  // current block
+      real_block = block->block;
+      // Forget the rows of the previous block so that the first row of this
+      // block opens a line without a paragraph-break test.
+      real_row = NULL;
+      row = NULL;
+
+      hocr_str.add_str_int("<div class='ocr_carea' id='block_", page_id);
+      hocr_str.add_str_int("_", bcnt++);
+      AddBoxTohOCR(real_block->bounding_box(), image_height_, &hocr_str);
+      hocr_str += "\n<p class='ocr_par'>\n";
+    }
+    if (row != page_res_it.row()) {
+      // Entering a new row: close the previous line, if any.
+      if (row != NULL) {
+        hocr_str += "</span>\n";
+      }
+      prev_row = real_row;
+
+      row = page_res_it.row();  // current row
+      real_row = row->row;
+
+      // Start a new paragraph when the geometry of this row and the previous
+      // row of the same block indicates a break.
+      if (prev_row != NULL &&
+          IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(),
+                           real_block->bounding_box().right(),
+                           real_row->x_height() + real_row->ascenders()))
+        hocr_str += "</p>\n<p class='ocr_par'>\n";
+
+      hocr_str.add_str_int("<span class='ocr_line' id='line_", page_id);
+      hocr_str.add_str_int("_", lcnt++);
+      AddBoxTohOCR(real_row->bounding_box(), image_height_, &hocr_str);
+    }
+
+    WERD_RES *word = page_res_it.word();
+    WERD_CHOICE* choice = word->best_choice;
+    if (choice != NULL) {
+      // Each word is an ocr_word span carrying the bbox, wrapping an
+      // xocr_word span carrying the certainty of the best choice.
+      hocr_str.add_str_int("<span class='ocr_word' id='word_", page_id);
+      hocr_str.add_str_int("_", wcnt);
+      AddBoxTohOCR(word->word->bounding_box(), image_height_, &hocr_str);
+      hocr_str.add_str_int("<span class='xocr_word' id='xword_", page_id);
+      hocr_str.add_str_int("_", wcnt++);
+      hocr_str.add_str_int("' title=\"x_wconf ", choice->certainty());
+      hocr_str += "\">";
+      if (word->bold > 0)
+        hocr_str += "<strong>";
+      if (word->italic > 0)
+        hocr_str += "<em>";
+      // The recognized text may contain &, < or >, so it must be escaped.
+      AddEscapedTextTohOCR(choice->unichar_string().string(), &hocr_str);
+      if (word->italic > 0)
+        hocr_str += "</em>";
+      if (word->bold > 0)
+        hocr_str += "</strong>";
+      hocr_str += "</span></span>";
+      if (!word->word->flag(W_EOL))
+        hocr_str += " ";
+    }
+  }
+  // Close the last line, paragraph and block only if a block was ever opened,
+  // so that a page without words still yields balanced markup.
+  if (block != NULL)
+    hocr_str += "</span>\n</p>\n</div>\n";
+  hocr_str += "</div>\n";
+  char *ret = new char[hocr_str.length() + 1];
+  strcpy(ret, hocr_str.string());
+  return ret;
+}
+
 static int ConvertWordToBoxText(WERD_RES *word,
                                ROW_RES* row,
                                int left,
--- a/api/baseapi.h
+++ b/api/baseapi.h
@ -281,6 +281,9 @@ class TESSDLL_API TessBaseAPI {
  // The recognized text is returned as a char* which is coded
  // as UTF8 and must be freed with the delete [] operator.
  char* GetUTF8Text();
+  // The recognized text is returned as a char* which is coded
+  // as HTML with hOCR markup and must be freed with the delete [] operator.
+  // page_id is embedded in the id attributes of the generated page, block,
+  // line and word elements. Returns NULL if recognition cannot be run.
+  char* GetHOCRText(int page_id);
  // The recognized text is returned as a char* which is coded in the same
  // format as a box file used in training. Returned string must be freed with
  // the delete [] operator.
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@ -18,6 +18,7 @@
 **********************************************************************/

 #include "mfcpch.h"
+#include <ctype.h>
 #include "applybox.h"
 #include "control.h"
 #include "tessvars.h"
@ -62,6 +63,7 @@ void read_tiff_image(TIFF* tif, IMAGE* image);
 #define EXTERN

 BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
+BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
 BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
 INT_VAR(tessedit_serial_unlv, 0,
        "0->Whole page, 1->serial no adapt, 2->serial with adapt");
@ -100,7 +102,7 @@ char szAppName[] = "Tessedit";   //app name
 // the value of input_file is ignored - ugly, but true - a consequence of
 // the way that unlv zone file reading takes the place of a page layout
 // analyzer.
-void TesseractImage(const char* input_file, IMAGE* image, Pix* pix,
+void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
                    tesseract::TessBaseAPI* api, STRING* text_out) {
  api->SetInputName(input_file);
 #ifdef HAVE_LIBLEPT
@ -122,6 +124,8 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix,
      text = api->GetBoxText();
    else if (tessedit_write_unlv)
      text = api->GetUNLVText();
+    else if (tessedit_create_hocr)
+      text = api->GetHOCRText(page_id);
    else
      text = api->GetUTF8Text();
    *text_out += text;
@ -198,37 +202,71 @@ int main(int argc, char **argv) {

  IMAGE image;
  STRING text_out;
+  int page_number = tessedit_page_number;
+  if (page_number < 0)
+    page_number = 0;
+  FILE* fp = fopen(argv[1], "rb");
+  if (fp == NULL) {
+    tprintf("Image file %s cannot be opened!\n", argv[1]);
+    exit(1);
+  }
 #ifdef HAVE_LIBLEPT
-  // Use leptonica to read images.
-  // If the image fails to read, try it as a list of filenames.
-  PIX* pix = pixRead(argv[1]);
-  if (pix == NULL) {
-    FILE* fp = fopen(argv[1], "r");
-    if (fp == NULL)
-      READFAILED.error(argv[0], EXIT, argv[1]);
-    char filename[MAX_PATH];
-    while (fgets(filename, sizeof(filename), fp) != NULL) {
-      chomp_string(filename);
-      pix = pixRead(filename);
-      if (pix == NULL)
-        READFAILED.error(argv[0], EXIT, argv[1]);
-      TesseractImage(argv[1], NULL, pix, &api, &text_out);
+  int page = page_number;
+  bool is_tiff = fileFormatIsTiff(fp);
+  fclose(fp);
+
+  if (is_tiff) {
+    // pix has to be declared in this branch: the only other declaration is
+    // local to the else branch that follows.
+    PIX* pix = NULL;
+    // Read and recognize one tiff page per iteration until a read fails.
+    for (; (pix = pixReadTiff(argv[1], page)) != NULL; ++page) {
+      if (page > 0)
+        tprintf("Page %d\n", page);
+      char page_str[kMaxIntSize];
+      snprintf(page_str, kMaxIntSize - 1, "%d", page);
+      api.SetVariable("applybox_page", page_str);
+
+      // Run tesseract on the page! The IMAGE argument is NULL because the
+      // page is supplied as a Pix; hOCR page ids are 1-based.
+      TesseractImage(argv[1], NULL, pix, page + 1, &api, &text_out);
+      pixDestroy(&pix);
+      // A specific page was requested, so stop after processing it.
+      if (tessedit_page_number >= 0) {
+        break;
+      }
+    }
+  } else {
+    // The file is not a tiff file, so use the general pixRead function.
+    // If the image fails to read, try it as a list of filenames.
+    PIX* pix = pixRead(argv[1]);
+    if (pix == NULL) {
+      FILE* fp = fopen(argv[1], "r");
+      if (fp == NULL) {
+        tprintf("File %s cannot be opened!\n", argv[1]);
+        exit(1);
+      }
+      char filename[MAX_PATH];
+      while (fgets(filename, sizeof(filename), fp) != NULL) {
+        chomp_string(filename);
+        pix = pixRead(filename);
+        if (pix == NULL) {
+          tprintf("Image file %s cannot be read!\n", filename);
+          exit(1);
+        }
+        tprintf("Page %d : %s\n", page, filename);
+        TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
+        pixDestroy(&pix);
+        ++page;
+      }
+      fclose(fp);
+    } else {
+      TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
      pixDestroy(&pix);
    }
-    fclose(fp);
-  } else {
-    TesseractImage(argv[1], NULL, pix, &api, &text_out);
-    pixDestroy(&pix);
-  }
+  }  // closes the non-tiff "} else {" branch, which was left unterminated
 #else
 #ifdef _TIFFIO_
  int len = strlen(argv[1]);
-  if (len > 3 && strcmp("tif", argv[1] + len - 3) == 0) {
+  // Lower-cased copy of the last 4 characters of the file name plus its
+  // terminating NUL. It stays empty when the name has fewer than 4
+  // characters, so nothing is read from before the start of argv[1], and it
+  // lives on the stack so there is nothing to free.
+  char ext[5] = "";
+  for (int i = 4; len > 3 && i >= 0; i--)
+    ext[4 - i] = (char) tolower((unsigned char) argv[1][len - i]);
+  if (len > 3 && (strcmp("tif",  ext + 1) == 0 || strcmp("tiff", ext) == 0)) {
    // Use libtiff to read a tif file so multi-page can be handled.
    // The page number so the tiff file can be closed and reopened.
-    int page_number = tessedit_page_number;
-    if (page_number < 0)
-      page_number = 0;
    TIFF* archive = NULL;
    do {
      // Since libtiff keeps all read images in memory we have to close the
@ -256,7 +294,7 @@ int main(int argc, char **argv) {
      read_tiff_image(archive, &image);

      // Run tesseract on the page!
-      TesseractImage(argv[1], &image, NULL, &api, &text_out);
+      TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
    // Do this while there are more pages in the tiff file.
    } while (TIFFReadDirectory(archive) &&
             (page_number <= tessedit_page_number || tessedit_page_number < 0));
@ -268,19 +306,35 @@ int main(int argc, char **argv) {
      READFAILED.error (argv[0], EXIT, argv[1]);
    if (image.read(image.get_ysize ()) < 0)
      MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
-    TesseractImage(argv[1], &image, NULL, &api, &text_out);
+    invert_image(&image);
+    TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
 #ifdef _TIFFIO_
  }
 #endif
 #endif  // HAVE_LIBLEPT

+  bool output_hocr = tessedit_create_hocr;
  outfile = argv[2];
-  outfile += ".txt";
-  FILE* fp = fopen(outfile.string(), "w");
-  if (fp != NULL) {
-    fwrite(text_out.string(), 1, text_out.length(), fp);
-    fclose(fp);
+  outfile += output_hocr ? ".html" : ".txt";
+  fp = fopen(outfile.string(), "w");
+  if (fp == NULL) {
+    tprintf("Cannot create output file %s\n", outfile.string());
+    exit(1);
  }
+  if (output_hocr) {
+    const char html_header[] =
+        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
+        " \"http://www.w3.org/TR/html4/loose.dtd\">\n"
+        "<html>\n<head>\n<title></title>\n"
+        "<meta http-equiv=\"Content-Type\" content=\"text/html;"
+        "charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
+        "</head>\n<body>\n";
+    fprintf(fp, "%s", html_header);
+  } 
+  fwrite(text_out.string(), 1, text_out.length(), fp);
+  if (output_hocr)
+    fprintf(fp, "</body>\n</html>\n");
+  fclose(fp);

  return 0;                      //Normal exit
 }
--- a/image/imgio.cpp
+++ b/image/imgio.cpp
@ -65,12 +65,30 @@ static IMAGETYPE imagetypes[] = { {
    read_tif_image,
    write_intel_tif
  },
+  {
+    "TIFF",
+    open_tif_image,
+    read_tif_image,
+    write_moto_tif
+  },
+  {
+    "tiff",
+    open_tif_image,
+    read_tif_image,
+    write_intel_tif
+  },
  {
    "bmp",
    open_bmp_image,
    read_bmp_image,
    write_bmp_image
  },
+  {
+    "BMP",
+    open_bmp_image,
+    read_bmp_image,
+    write_bmp_image
+  },
 };                               //image readers/writers

 #define MAXIMAGETYPES   (sizeof(imagetypes)/sizeof(IMAGETYPE))