Added page numbers to box files

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@352 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2010-05-20 23:06:35 +00:00
parent 0bebd93363
commit a5b4570180
3 changed files with 35 additions and 25 deletions

View File

@ -747,7 +747,8 @@ static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
// Make an HTML-formatted string with hOCR markup from the internal
// data structures.
// STL removed from orignal patch submission and refactored by rays.
// STL removed from original patch submission and refactored by rays.
// page_id is 1-based and will appear in the output.
char* TessBaseAPI::GetHOCRText(int page_id) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
@ -845,6 +846,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
ROW_RES* row,
int left,
int bottom,
int image_width,
int image_height,
int page_number,
char* word_str) {
// Copy the output word and denormalize it back to image coords.
WERD copy_outword;
@ -863,9 +867,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
TBOX blob_box = blob->bounding_box();
if (word->tess_failed ||
blob_box.left() < 0 ||
blob_box.right() > page_image.get_xsize() ||
blob_box.right() > image_width ||
blob_box.bottom() < 0 ||
blob_box.top() > page_image.get_ysize()) {
blob_box.top() > image_height) {
// Bounding boxes can be illegal when tess fails on a word.
blob_box = word->word->bounding_box(); // Use original word as backup.
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
@ -884,27 +888,29 @@ static int ConvertWordToBoxText(WERD_RES *word,
ch = kTesseractReject;
word_str[output_size++] = ch;
}
sprintf(word_str + output_size, " %d %d %d %d\n",
sprintf(word_str + output_size, " %d %d %d %d %d\n",
blob_box.left() + left, blob_box.bottom() + bottom,
blob_box.right() + left, blob_box.top() + bottom);
blob_box.right() + left, blob_box.top() + bottom,
page_number);
output_size += strlen(word_str + output_size);
}
}
return output_size;
}
// Multiplier for max expected textlength assumes typically 4 numbers @
// (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
// Multiplier for max expected text length assumes typically 5 numbers at
// (5 digits and a space) each, plus the newline = 5*(5+1)+1. Add to this the
// original UTF8 characters, and one kMaxCharsPerChar.
const int kCharsPerChar = 25;
// A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
// space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
const int kCharsPerChar = 31;
// A maximal single box could occupy 5 numbers at 20 digits (for 64 bit) and a
// space plus the newline 5*(20+1)+1 and the maximum length of a UNICHAR.
// Test against this on each iteration for safety.
const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
const int kMaxCharsPerChar = 106 + UNICHAR_LEN;
// The recognized text is returned as a char* which is coded
// as a UTF8 box file and must be freed with the delete [] operator.
char* TessBaseAPI::GetBoxText() {
// page_number is a 0-based page index that will appear in the box file.
char* TessBaseAPI::GetBoxText(int page_number) {
int bottom = image_height_ - (rect_top_ + rect_height_);
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
@ -919,7 +925,8 @@ char* TessBaseAPI::GetBoxText() {
page_res_it.forward()) {
WERD_RES *word = page_res_it.word();
ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
ptr);
image_width_, image_height_,
page_number, ptr);
// Just in case...
if (ptr - result + kMaxCharsPerChar > total_length)
break;

View File

@ -281,14 +281,17 @@ class TESSDLL_API TessBaseAPI {
// The recognized text is returned as a char* which is coded
// as UTF8 and must be freed with the delete [] operator.
char* GetUTF8Text();
// The recognized text is returned as a char* which is coded
// as HTML with hOCR markup and must be freed with the delete [] operator.
char* GetHOCRText(int page_id);
// Make an HTML-formatted string with hOCR markup from the internal
// data structures.
// STL removed from original patch submission and refactored by rays.
// page_id is 1-based and will appear in the output.
char* GetHOCRText(int page_id);
// The recognized text is returned as a char* which is coded in the same
// format as a box file used in training. Returned string must be freed with
// the delete [] operator.
// Constructs coordinates in the original image - not just the rectangle.
char* GetBoxText();
// page_number is a 0-based page index that will appear in the box file.
char* GetBoxText(int page_number);
// The recognized text is returned as a char* which is coded
// as UNLV format Latin-1 with specific reject and suspect codes
// and must be freed with the delete [] operator.

View File

@ -101,7 +101,7 @@ char szAppName[] = "Tessedit"; //app name
// the value of input_file is ignored - ugly, but true - a consequence of
// the way that unlv zone file reading takes the place of a page layout
// analyzer.
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
tesseract::TessBaseAPI* api, STRING* text_out) {
api->SetInputName(input_file);
#ifdef HAVE_LIBLEPT
@ -120,11 +120,11 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
if (tessedit_serial_unlv == 0) {
char* text;
if (tessedit_create_boxfile)
text = api->GetBoxText();
text = api->GetBoxText(page_index);
else if (tessedit_write_unlv)
text = api->GetUNLVText();
else if (tessedit_create_hocr)
text = api->GetHOCRText(page_id);
text = api->GetHOCRText(page_index + 1);
else
text = api->GetUTF8Text();
*text_out += text;
@ -232,7 +232,7 @@ int main(int argc, char **argv) {
api.SetVariable("applybox_page", page_str);
// Run tesseract on the page!
TesseractImage(argv[1], pix, page + 1, &api, &text_out);
TesseractImage(argv[1], NULL, pix, page, &api, &text_out);
pixDestroy(&pix);
if (tessedit_page_number >= 0) {
break;
@ -257,13 +257,13 @@ int main(int argc, char **argv) {
exit(1);
}
tprintf("Page %d : %s\n", page, filename);
TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
TesseractImage(filename, NULL, pix, page, &api, &text_out);
pixDestroy(&pix);
++page;
}
fclose(fp);
} else {
TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
TesseractImage(argv[1], NULL, pix, 0, &api, &text_out);
pixDestroy(&pix);
}
}
@ -297,13 +297,13 @@ int main(int argc, char **argv) {
char page_str[kMaxIntSize];
snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
api.SetVariable("applybox_page", page_str);
++page_number;
// Read the current page into the Tesseract image.
IMAGE image;
read_tiff_image(archive, &image);
// Run tesseract on the page!
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
++page_number;
// Do this while there are more pages in the tiff file.
} while (TIFFReadDirectory(archive) &&
(page_number <= tessedit_page_number || tessedit_page_number < 0));
@ -318,7 +318,7 @@ int main(int argc, char **argv) {
if (image.read(image.get_ysize ()) < 0)
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
invert_image(&image);
TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
TesseractImage(argv[1], &image, NULL, 0, &api, &text_out);
#ifdef _TIFFIO_
}
#endif