Added page numbers to box files

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@352 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2010-05-20 23:06:35 +00:00
parent 0bebd93363
commit a5b4570180
3 changed files with 35 additions and 25 deletions

View File

@ -747,7 +747,8 @@ static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
// Make a HTML-formatted string with hOCR markup from the internal // Make a HTML-formatted string with hOCR markup from the internal
// data structures. // data structures.
// STL removed from orignal patch submission and refactored by rays. // STL removed from original patch submission and refactored by rays.
// page_id is 1-based and will appear in the output.
char* TessBaseAPI::GetHOCRText(int page_id) { char* TessBaseAPI::GetHOCRText(int page_id) {
if (tesseract_ == NULL || if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0)) (page_res_ == NULL && Recognize(NULL) < 0))
@ -845,6 +846,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
ROW_RES* row, ROW_RES* row,
int left, int left,
int bottom, int bottom,
int image_width,
int image_height,
int page_number,
char* word_str) { char* word_str) {
// Copy the output word and denormalize it back to image coords. // Copy the output word and denormalize it back to image coords.
WERD copy_outword; WERD copy_outword;
@ -863,9 +867,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
TBOX blob_box = blob->bounding_box(); TBOX blob_box = blob->bounding_box();
if (word->tess_failed || if (word->tess_failed ||
blob_box.left() < 0 || blob_box.left() < 0 ||
blob_box.right() > page_image.get_xsize() || blob_box.right() > image_width ||
blob_box.bottom() < 0 || blob_box.bottom() < 0 ||
blob_box.top() > page_image.get_ysize()) { blob_box.top() > image_height) {
// Bounding boxes can be illegal when tess fails on a word. // Bounding boxes can be illegal when tess fails on a word.
blob_box = word->word->bounding_box(); // Use original word as backup. blob_box = word->word->bounding_box(); // Use original word as backup.
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n", tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
@ -884,27 +888,29 @@ static int ConvertWordToBoxText(WERD_RES *word,
ch = kTesseractReject; ch = kTesseractReject;
word_str[output_size++] = ch; word_str[output_size++] = ch;
} }
sprintf(word_str + output_size, " %d %d %d %d\n", sprintf(word_str + output_size, " %d %d %d %d %d\n",
blob_box.left() + left, blob_box.bottom() + bottom, blob_box.left() + left, blob_box.bottom() + bottom,
blob_box.right() + left, blob_box.top() + bottom); blob_box.right() + left, blob_box.top() + bottom,
page_number);
output_size += strlen(word_str + output_size); output_size += strlen(word_str + output_size);
} }
} }
return output_size; return output_size;
} }
// Multiplier for max expected textlength assumes typically 4 numbers @ // Multiplier for max expected textlength assumes typically 5 numbers @
// (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the // (5 digits and a space) plus the newline = 5*(5+1)+1. Add to this the
// original UTF8 characters, and one kMaxCharsPerChar. // original UTF8 characters, and one kMaxCharsPerChar.
const int kCharsPerChar = 25; const int kCharsPerChar = 31;
// A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a // A maximal single box could occupy 5 numbers at 20 digits (for 64 bit) and a
// space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR. // space plus the newline 5*(20+1)+1 and the maximum length of a UNICHAR.
// Test against this on each iteration for safety. // Test against this on each iteration for safety.
const int kMaxCharsPerChar = 85 + UNICHAR_LEN; const int kMaxCharsPerChar = 106 + UNICHAR_LEN;
// The recognized text is returned as a char* which is coded // The recognized text is returned as a char* which is coded
// as a UTF8 box file and must be freed with the delete [] operator. // as a UTF8 box file and must be freed with the delete [] operator.
char* TessBaseAPI::GetBoxText() { // page_number is a 0-based page index that will appear in the box file.
char* TessBaseAPI::GetBoxText(int page_number) {
int bottom = image_height_ - (rect_top_ + rect_height_); int bottom = image_height_ - (rect_top_ + rect_height_);
if (tesseract_ == NULL || if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0)) (page_res_ == NULL && Recognize(NULL) < 0))
@ -919,7 +925,8 @@ char* TessBaseAPI::GetBoxText() {
page_res_it.forward()) { page_res_it.forward()) {
WERD_RES *word = page_res_it.word(); WERD_RES *word = page_res_it.word();
ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom, ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
ptr); image_width_, image_height_,
page_number, ptr);
// Just in case... // Just in case...
if (ptr - result + kMaxCharsPerChar > total_length) if (ptr - result + kMaxCharsPerChar > total_length)
break; break;

View File

@ -281,14 +281,17 @@ class TESSDLL_API TessBaseAPI {
// The recognized text is returned as a char* which is coded // The recognized text is returned as a char* which is coded
// as UTF8 and must be freed with the delete [] operator. // as UTF8 and must be freed with the delete [] operator.
char* GetUTF8Text(); char* GetUTF8Text();
// The recognized text is returned as a char* which is coded // Make a HTML-formatted string with hOCR markup from the internal
// as HTML with hOCR markup and must be freed with the delete [] operator. // data structures.
char* GetHOCRText(int page_id); // STL removed from original patch submission and refactored by rays.
// page_id is 1-based and will appear in the output.
char* GetHOCRText(int page_id);
// The recognized text is returned as a char* which is coded in the same // The recognized text is returned as a char* which is coded in the same
// format as a box file used in training. Returned string must be freed with // format as a box file used in training. Returned string must be freed with
// the delete [] operator. // the delete [] operator.
// Constructs coordinates in the original image - not just the rectangle. // Constructs coordinates in the original image - not just the rectangle.
char* GetBoxText(); // page_number is a 0-based page index that will appear in the box file.
char* GetBoxText(int page_number);
// The recognized text is returned as a char* which is coded // The recognized text is returned as a char* which is coded
// as UNLV format Latin-1 with specific reject and suspect codes // as UNLV format Latin-1 with specific reject and suspect codes
// and must be freed with the delete [] operator. // and must be freed with the delete [] operator.

View File

@ -101,7 +101,7 @@ char szAppName[] = "Tessedit"; //app name
// the value of input_file is ignored - ugly, but true - a consequence of // the value of input_file is ignored - ugly, but true - a consequence of
// the way that unlv zone file reading takes the place of a page layout // the way that unlv zone file reading takes the place of a page layout
// analyzer. // analyzer.
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id, void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
tesseract::TessBaseAPI* api, STRING* text_out) { tesseract::TessBaseAPI* api, STRING* text_out) {
api->SetInputName(input_file); api->SetInputName(input_file);
#ifdef HAVE_LIBLEPT #ifdef HAVE_LIBLEPT
@ -120,11 +120,11 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
if (tessedit_serial_unlv == 0) { if (tessedit_serial_unlv == 0) {
char* text; char* text;
if (tessedit_create_boxfile) if (tessedit_create_boxfile)
text = api->GetBoxText(); text = api->GetBoxText(page_index);
else if (tessedit_write_unlv) else if (tessedit_write_unlv)
text = api->GetUNLVText(); text = api->GetUNLVText();
else if (tessedit_create_hocr) else if (tessedit_create_hocr)
text = api->GetHOCRText(page_id); text = api->GetHOCRText(page_index + 1);
else else
text = api->GetUTF8Text(); text = api->GetUTF8Text();
*text_out += text; *text_out += text;
@ -232,7 +232,7 @@ int main(int argc, char **argv) {
api.SetVariable("applybox_page", page_str); api.SetVariable("applybox_page", page_str);
// Run tesseract on the page! // Run tesseract on the page!
TesseractImage(argv[1], pix, page + 1, &api, &text_out); TesseractImage(argv[1], NULL, pix, page, &api, &text_out);
pixDestroy(&pix); pixDestroy(&pix);
if (tessedit_page_number >= 0) { if (tessedit_page_number >= 0) {
break; break;
@ -257,13 +257,13 @@ int main(int argc, char **argv) {
exit(1); exit(1);
} }
tprintf("Page %d : %s\n", page, filename); tprintf("Page %d : %s\n", page, filename);
TesseractImage(filename, NULL, pix, page + 1, &api, &text_out); TesseractImage(filename, NULL, pix, page, &api, &text_out);
pixDestroy(&pix); pixDestroy(&pix);
++page; ++page;
} }
fclose(fp); fclose(fp);
} else { } else {
TesseractImage(argv[1], NULL, pix, 1, &api, &text_out); TesseractImage(argv[1], NULL, pix, 0, &api, &text_out);
pixDestroy(&pix); pixDestroy(&pix);
} }
} }
@ -297,13 +297,13 @@ int main(int argc, char **argv) {
char page_str[kMaxIntSize]; char page_str[kMaxIntSize];
snprintf(page_str, kMaxIntSize - 1, "%d", page_number); snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
api.SetVariable("applybox_page", page_str); api.SetVariable("applybox_page", page_str);
++page_number;
// Read the current page into the Tesseract image. // Read the current page into the Tesseract image.
IMAGE image; IMAGE image;
read_tiff_image(archive, &image); read_tiff_image(archive, &image);
// Run tesseract on the page! // Run tesseract on the page!
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out); TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
++page_number;
// Do this while there are more pages in the tiff file. // Do this while there are more pages in the tiff file.
} while (TIFFReadDirectory(archive) && } while (TIFFReadDirectory(archive) &&
(page_number <= tessedit_page_number || tessedit_page_number < 0)); (page_number <= tessedit_page_number || tessedit_page_number < 0));
@ -318,7 +318,7 @@ int main(int argc, char **argv) {
if (image.read(image.get_ysize ()) < 0) if (image.read(image.get_ysize ()) < 0)
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]); MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
invert_image(&image); invert_image(&image);
TesseractImage(argv[1], &image, NULL, 1, &api, &text_out); TesseractImage(argv[1], &image, NULL, 0, &api, &text_out);
#ifdef _TIFFIO_ #ifdef _TIFFIO_
} }
#endif #endif