Added page numbers to box files

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@352 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-12 05:13:14 +08:00 · 2010-05-20 23:06:35 +00:00 · 2010-05-20 23:06:35 +00:00 · a5b4570180
commit a5b4570180
parent 0bebd93363
3 changed files with 35 additions and 25 deletions
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@ -747,7 +747,8 @@ static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {

 // Make a HTML-formatted string with hOCR markup from the internal
 // data structures.
-// STL removed from orignal patch submission and refactored by rays.
+// STL removed from original patch submission and refactored by rays.
+// page_id is 1-based and will appear in the output.
 char* TessBaseAPI::GetHOCRText(int page_id) {
  if (tesseract_ == NULL ||
      (page_res_ == NULL && Recognize(NULL) < 0))
@ -845,6 +846,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
                                ROW_RES* row,
                                int left,
                                int bottom,
+                                int image_width,
+                                int image_height,
+                                int page_number,
                                char* word_str) {
  // Copy the output word and denormalize it back to image coords.
  WERD copy_outword;
@ -863,9 +867,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
      TBOX blob_box = blob->bounding_box();
      if (word->tess_failed ||
          blob_box.left() < 0 ||
-          blob_box.right() > page_image.get_xsize() ||
+          blob_box.right() > image_width ||
          blob_box.bottom() < 0 ||
-          blob_box.top() > page_image.get_ysize()) {
+          blob_box.top() > image_height) {
        // Bounding boxes can be illegal when tess fails on a word.
        blob_box = word->word->bounding_box();  // Use original word as backup.
        tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
@ -884,27 +888,29 @@ static int ConvertWordToBoxText(WERD_RES *word,
          ch = kTesseractReject;
        word_str[output_size++] = ch;
      }
-      sprintf(word_str + output_size, " %d %d %d %d\n",
+      sprintf(word_str + output_size, " %d %d %d %d %d\n",
              blob_box.left() + left, blob_box.bottom() + bottom,
-              blob_box.right() + left, blob_box.top() + bottom);
+              blob_box.right() + left, blob_box.top() + bottom,
+              page_number);
      output_size += strlen(word_str + output_size);
    }
  }
  return output_size;
 }

-// Multiplier for max expected textlength assumes typically 4 numbers @
-// (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
+// Multiplier for max expected textlength assumes typically 5 numbers @
+// (5 digits and a space) plus the newline = 5*(5+1)+1. Add to this the
 // orginal UTF8 characters, and one kMaxCharsPerChar.
-const int kCharsPerChar = 25;
-// A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
-// space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
+const int kCharsPerChar = 31;
+// A maximal single box could occupy 5 numbers at 20 digits (for 64 bit) and a
+// space plus the newline 5*(20+1)+1 and the maximum length of a UNICHAR.
 // Test against this on each iteration for safety.
-const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
+const int kMaxCharsPerChar = 106 + UNICHAR_LEN;

 // The recognized text is returned as a char* which is coded
 // as a UTF8 box file and must be freed with the delete [] operator.
-char* TessBaseAPI::GetBoxText() {
+// page_number is a 0-based page index that will appear in the box file.
+char* TessBaseAPI::GetBoxText(int page_number) {
  int bottom = image_height_ - (rect_top_ + rect_height_);
  if (tesseract_ == NULL ||
      (page_res_ == NULL && Recognize(NULL) < 0))
@ -919,7 +925,8 @@ char* TessBaseAPI::GetBoxText() {
       page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();
    ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
-                                ptr);
+                                image_width_, image_height_,
+                                page_number, ptr);
    // Just in case...
    if (ptr - result + kMaxCharsPerChar > total_length)
      break;
--- a/api/baseapi.h
+++ b/api/baseapi.h
@ -281,14 +281,17 @@ class TESSDLL_API TessBaseAPI {
  // The recognized text is returned as a char* which is coded
  // as UTF8 and must be freed with the delete [] operator.
  char* GetUTF8Text();
-  // The recognized text is returned as a char* which is coded
-  // as HTML with hOCR markup and must be freed with the delete [] operator.
-  char* GetHOCRText(int page_id);
+  // Make an HTML-formatted string with hOCR markup from the internal
+  // data structures. The returned string must be freed with delete [].
+  // STL removed from original patch submission and refactored by rays.
+  // page_id is 1-based and will appear in the output.
+ char* GetHOCRText(int page_id);
  // The recognized text is returned as a char* which is coded in the same
  // format as a box file used in training. Returned string must be freed with
  // the delete [] operator.
  // Constructs coordinates in the original image - not just the rectangle.
-  char* GetBoxText();
+  // page_number is a 0-based page index that will appear in the box file.
+  char* GetBoxText(int page_number);
  // The recognized text is returned as a char* which is coded
  // as UNLV format Latin-1 with specific reject and suspect codes
  // and must be freed with the delete [] operator.
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@ -101,7 +101,7 @@ char szAppName[] = "Tessedit";   //app name
 // the value of input_file is ignored - ugly, but true - a consequence of
 // the way that unlv zone file reading takes the place of a page layout
 // analyzer.
-void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
+void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
                    tesseract::TessBaseAPI* api, STRING* text_out) {
  api->SetInputName(input_file);
 #ifdef HAVE_LIBLEPT
@ -120,11 +120,11 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
  if (tessedit_serial_unlv == 0) {
    char* text;
    if (tessedit_create_boxfile)
-      text = api->GetBoxText();
+      text = api->GetBoxText(page_index);
    else if (tessedit_write_unlv)
      text = api->GetUNLVText();
    else if (tessedit_create_hocr)
-      text = api->GetHOCRText(page_id);
+      text = api->GetHOCRText(page_index + 1);
    else
      text = api->GetUTF8Text();
    *text_out += text;
@ -232,7 +232,7 @@ int main(int argc, char **argv) {
      api.SetVariable("applybox_page", page_str);

      // Run tesseract on the page!
-      TesseractImage(argv[1], pix, page + 1, &api, &text_out);
+      TesseractImage(argv[1], NULL, pix, page, &api, &text_out);
      pixDestroy(&pix);
      if (tessedit_page_number >= 0) {
        break;
@ -257,13 +257,13 @@ int main(int argc, char **argv) {
          exit(1);
        }
        tprintf("Page %d : %s\n", page, filename);
-        TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
+        TesseractImage(filename, NULL, pix, page, &api, &text_out);
        pixDestroy(&pix);
        ++page;
      }
      fclose(fp);
    } else {
-      TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
+      TesseractImage(argv[1], NULL, pix, 0, &api, &text_out);
      pixDestroy(&pix);
    }
  }
@ -297,13 +297,13 @@ int main(int argc, char **argv) {
      char page_str[kMaxIntSize];
      snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
      api.SetVariable("applybox_page", page_str);
-      ++page_number;
      // Read the current page into the Tesseract image.
      IMAGE image;
      read_tiff_image(archive, &image);

      // Run tesseract on the page!
      TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
+      ++page_number;
    // Do this while there are more pages in the tiff file.
    } while (TIFFReadDirectory(archive) &&
             (page_number <= tessedit_page_number || tessedit_page_number < 0));
@ -318,7 +318,7 @@ int main(int argc, char **argv) {
    if (image.read(image.get_ysize ()) < 0)
      MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
    invert_image(&image);
-    TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
+    TesseractImage(argv[1], &image, NULL, 0, &api, &text_out);
 #ifdef _TIFFIO_
  }
 #endif