mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-13 06:08:52 +08:00
Added page numbers to box files
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@352 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
0bebd93363
commit
a5b4570180
@ -747,7 +747,8 @@ static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
|
|||||||
|
|
||||||
// Make a HTML-formatted string with hOCR markup from the internal
|
// Make a HTML-formatted string with hOCR markup from the internal
|
||||||
// data structures.
|
// data structures.
|
||||||
// STL removed from orignal patch submission and refactored by rays.
|
// STL removed from original patch submission and refactored by rays.
|
||||||
|
// page_id is 1-based and will appear in the output.
|
||||||
char* TessBaseAPI::GetHOCRText(int page_id) {
|
char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||||
if (tesseract_ == NULL ||
|
if (tesseract_ == NULL ||
|
||||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||||
@ -845,6 +846,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
|
|||||||
ROW_RES* row,
|
ROW_RES* row,
|
||||||
int left,
|
int left,
|
||||||
int bottom,
|
int bottom,
|
||||||
|
int image_width,
|
||||||
|
int image_height,
|
||||||
|
int page_number,
|
||||||
char* word_str) {
|
char* word_str) {
|
||||||
// Copy the output word and denormalize it back to image coords.
|
// Copy the output word and denormalize it back to image coords.
|
||||||
WERD copy_outword;
|
WERD copy_outword;
|
||||||
@ -863,9 +867,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
|
|||||||
TBOX blob_box = blob->bounding_box();
|
TBOX blob_box = blob->bounding_box();
|
||||||
if (word->tess_failed ||
|
if (word->tess_failed ||
|
||||||
blob_box.left() < 0 ||
|
blob_box.left() < 0 ||
|
||||||
blob_box.right() > page_image.get_xsize() ||
|
blob_box.right() > image_width ||
|
||||||
blob_box.bottom() < 0 ||
|
blob_box.bottom() < 0 ||
|
||||||
blob_box.top() > page_image.get_ysize()) {
|
blob_box.top() > image_height) {
|
||||||
// Bounding boxes can be illegal when tess fails on a word.
|
// Bounding boxes can be illegal when tess fails on a word.
|
||||||
blob_box = word->word->bounding_box(); // Use original word as backup.
|
blob_box = word->word->bounding_box(); // Use original word as backup.
|
||||||
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
|
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
|
||||||
@ -884,27 +888,29 @@ static int ConvertWordToBoxText(WERD_RES *word,
|
|||||||
ch = kTesseractReject;
|
ch = kTesseractReject;
|
||||||
word_str[output_size++] = ch;
|
word_str[output_size++] = ch;
|
||||||
}
|
}
|
||||||
sprintf(word_str + output_size, " %d %d %d %d\n",
|
sprintf(word_str + output_size, " %d %d %d %d %d\n",
|
||||||
blob_box.left() + left, blob_box.bottom() + bottom,
|
blob_box.left() + left, blob_box.bottom() + bottom,
|
||||||
blob_box.right() + left, blob_box.top() + bottom);
|
blob_box.right() + left, blob_box.top() + bottom,
|
||||||
|
page_number);
|
||||||
output_size += strlen(word_str + output_size);
|
output_size += strlen(word_str + output_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return output_size;
|
return output_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Multiplier for max expected textlength assumes typically 4 numbers @
|
// Multiplier for max expected textlength assumes typically 5 numbers @
|
||||||
// (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
|
// (5 digits and a space) plus the newline = 5*(5+1)+1. Add to this the
|
||||||
// original UTF8 characters, and one kMaxCharsPerChar.
|
// original UTF8 characters, and one kMaxCharsPerChar.
|
||||||
const int kCharsPerChar = 25;
|
const int kCharsPerChar = 31;
|
||||||
// A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
|
// A maximal single box could occupy 5 numbers at 20 digits (for 64 bit) and a
|
||||||
// space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
|
// space plus the newline 5*(20+1)+1 and the maximum length of a UNICHAR.
|
||||||
// Test against this on each iteration for safety.
|
// Test against this on each iteration for safety.
|
||||||
const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
|
const int kMaxCharsPerChar = 106 + UNICHAR_LEN;
|
||||||
|
|
||||||
// The recognized text is returned as a char* which is coded
|
// The recognized text is returned as a char* which is coded
|
||||||
// as a UTF8 box file and must be freed with the delete [] operator.
|
// as a UTF8 box file and must be freed with the delete [] operator.
|
||||||
char* TessBaseAPI::GetBoxText() {
|
// page_number is a 0-based page index that will appear in the box file.
|
||||||
|
char* TessBaseAPI::GetBoxText(int page_number) {
|
||||||
int bottom = image_height_ - (rect_top_ + rect_height_);
|
int bottom = image_height_ - (rect_top_ + rect_height_);
|
||||||
if (tesseract_ == NULL ||
|
if (tesseract_ == NULL ||
|
||||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||||
@ -919,7 +925,8 @@ char* TessBaseAPI::GetBoxText() {
|
|||||||
page_res_it.forward()) {
|
page_res_it.forward()) {
|
||||||
WERD_RES *word = page_res_it.word();
|
WERD_RES *word = page_res_it.word();
|
||||||
ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
|
ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
|
||||||
ptr);
|
image_width_, image_height_,
|
||||||
|
page_number, ptr);
|
||||||
// Just in case...
|
// Just in case...
|
||||||
if (ptr - result + kMaxCharsPerChar > total_length)
|
if (ptr - result + kMaxCharsPerChar > total_length)
|
||||||
break;
|
break;
|
||||||
|
@ -281,14 +281,17 @@ class TESSDLL_API TessBaseAPI {
|
|||||||
// The recognized text is returned as a char* which is coded
|
// The recognized text is returned as a char* which is coded
|
||||||
// as UTF8 and must be freed with the delete [] operator.
|
// as UTF8 and must be freed with the delete [] operator.
|
||||||
char* GetUTF8Text();
|
char* GetUTF8Text();
|
||||||
// The recognized text is returned as a char* which is coded
|
// Make a HTML-formatted string with hOCR markup from the internal
|
||||||
// as HTML with hOCR markup and must be freed with the delete [] operator.
|
// data structures.
|
||||||
char* GetHOCRText(int page_id);
|
// STL removed from original patch submission and refactored by rays.
|
||||||
|
// page_id is 1-based and will appear in the output.
|
||||||
|
char* GetHOCRText(int page_id);
|
||||||
// The recognized text is returned as a char* which is coded in the same
|
// The recognized text is returned as a char* which is coded in the same
|
||||||
// format as a box file used in training. Returned string must be freed with
|
// format as a box file used in training. Returned string must be freed with
|
||||||
// the delete [] operator.
|
// the delete [] operator.
|
||||||
// Constructs coordinates in the original image - not just the rectangle.
|
// Constructs coordinates in the original image - not just the rectangle.
|
||||||
char* GetBoxText();
|
// page_number is a 0-based page index that will appear in the box file.
|
||||||
|
char* GetBoxText(int page_number);
|
||||||
// The recognized text is returned as a char* which is coded
|
// The recognized text is returned as a char* which is coded
|
||||||
// as UNLV format Latin-1 with specific reject and suspect codes
|
// as UNLV format Latin-1 with specific reject and suspect codes
|
||||||
// and must be freed with the delete [] operator.
|
// and must be freed with the delete [] operator.
|
||||||
|
@ -101,7 +101,7 @@ char szAppName[] = "Tessedit"; //app name
|
|||||||
// the value of input_file is ignored - ugly, but true - a consequence of
|
// the value of input_file is ignored - ugly, but true - a consequence of
|
||||||
// the way that unlv zone file reading takes the place of a page layout
|
// the way that unlv zone file reading takes the place of a page layout
|
||||||
// analyzer.
|
// analyzer.
|
||||||
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
|
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
|
||||||
tesseract::TessBaseAPI* api, STRING* text_out) {
|
tesseract::TessBaseAPI* api, STRING* text_out) {
|
||||||
api->SetInputName(input_file);
|
api->SetInputName(input_file);
|
||||||
#ifdef HAVE_LIBLEPT
|
#ifdef HAVE_LIBLEPT
|
||||||
@ -120,11 +120,11 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
|
|||||||
if (tessedit_serial_unlv == 0) {
|
if (tessedit_serial_unlv == 0) {
|
||||||
char* text;
|
char* text;
|
||||||
if (tessedit_create_boxfile)
|
if (tessedit_create_boxfile)
|
||||||
text = api->GetBoxText();
|
text = api->GetBoxText(page_index);
|
||||||
else if (tessedit_write_unlv)
|
else if (tessedit_write_unlv)
|
||||||
text = api->GetUNLVText();
|
text = api->GetUNLVText();
|
||||||
else if (tessedit_create_hocr)
|
else if (tessedit_create_hocr)
|
||||||
text = api->GetHOCRText(page_id);
|
text = api->GetHOCRText(page_index + 1);
|
||||||
else
|
else
|
||||||
text = api->GetUTF8Text();
|
text = api->GetUTF8Text();
|
||||||
*text_out += text;
|
*text_out += text;
|
||||||
@ -232,7 +232,7 @@ int main(int argc, char **argv) {
|
|||||||
api.SetVariable("applybox_page", page_str);
|
api.SetVariable("applybox_page", page_str);
|
||||||
|
|
||||||
// Run tesseract on the page!
|
// Run tesseract on the page!
|
||||||
TesseractImage(argv[1], pix, page + 1, &api, &text_out);
|
TesseractImage(argv[1], NULL, pix, page, &api, &text_out);
|
||||||
pixDestroy(&pix);
|
pixDestroy(&pix);
|
||||||
if (tessedit_page_number >= 0) {
|
if (tessedit_page_number >= 0) {
|
||||||
break;
|
break;
|
||||||
@ -257,13 +257,13 @@ int main(int argc, char **argv) {
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
tprintf("Page %d : %s\n", page, filename);
|
tprintf("Page %d : %s\n", page, filename);
|
||||||
TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
|
TesseractImage(filename, NULL, pix, page, &api, &text_out);
|
||||||
pixDestroy(&pix);
|
pixDestroy(&pix);
|
||||||
++page;
|
++page;
|
||||||
}
|
}
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
} else {
|
} else {
|
||||||
TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
|
TesseractImage(argv[1], NULL, pix, 0, &api, &text_out);
|
||||||
pixDestroy(&pix);
|
pixDestroy(&pix);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -297,13 +297,13 @@ int main(int argc, char **argv) {
|
|||||||
char page_str[kMaxIntSize];
|
char page_str[kMaxIntSize];
|
||||||
snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
|
snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
|
||||||
api.SetVariable("applybox_page", page_str);
|
api.SetVariable("applybox_page", page_str);
|
||||||
++page_number;
|
|
||||||
// Read the current page into the Tesseract image.
|
// Read the current page into the Tesseract image.
|
||||||
IMAGE image;
|
IMAGE image;
|
||||||
read_tiff_image(archive, &image);
|
read_tiff_image(archive, &image);
|
||||||
|
|
||||||
// Run tesseract on the page!
|
// Run tesseract on the page!
|
||||||
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
|
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
|
||||||
|
++page_number;
|
||||||
// Do this while there are more pages in the tiff file.
|
// Do this while there are more pages in the tiff file.
|
||||||
} while (TIFFReadDirectory(archive) &&
|
} while (TIFFReadDirectory(archive) &&
|
||||||
(page_number <= tessedit_page_number || tessedit_page_number < 0));
|
(page_number <= tessedit_page_number || tessedit_page_number < 0));
|
||||||
@ -318,7 +318,7 @@ int main(int argc, char **argv) {
|
|||||||
if (image.read(image.get_ysize ()) < 0)
|
if (image.read(image.get_ysize ()) < 0)
|
||||||
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
|
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
|
||||||
invert_image(&image);
|
invert_image(&image);
|
||||||
TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
|
TesseractImage(argv[1], &image, NULL, 0, &api, &text_out);
|
||||||
#ifdef _TIFFIO_
|
#ifdef _TIFFIO_
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user