mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-12 05:13:14 +08:00
Added page numbers to box files
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@352 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
0bebd93363
commit
a5b4570180
@ -747,7 +747,8 @@ static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
|
||||
|
||||
// Make a HTML-formatted string with hOCR markup from the internal
|
||||
// data structures.
|
||||
// STL removed from orignal patch submission and refactored by rays.
|
||||
// STL removed from original patch submission and refactored by rays.
|
||||
// page_id is 1-based and will appear in the output.
|
||||
char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||
if (tesseract_ == NULL ||
|
||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||
@ -845,6 +846,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
|
||||
ROW_RES* row,
|
||||
int left,
|
||||
int bottom,
|
||||
int image_width,
|
||||
int image_height,
|
||||
int page_number,
|
||||
char* word_str) {
|
||||
// Copy the output word and denormalize it back to image coords.
|
||||
WERD copy_outword;
|
||||
@ -863,9 +867,9 @@ static int ConvertWordToBoxText(WERD_RES *word,
|
||||
TBOX blob_box = blob->bounding_box();
|
||||
if (word->tess_failed ||
|
||||
blob_box.left() < 0 ||
|
||||
blob_box.right() > page_image.get_xsize() ||
|
||||
blob_box.right() > image_width ||
|
||||
blob_box.bottom() < 0 ||
|
||||
blob_box.top() > page_image.get_ysize()) {
|
||||
blob_box.top() > image_height) {
|
||||
// Bounding boxes can be illegal when tess fails on a word.
|
||||
blob_box = word->word->bounding_box(); // Use original word as backup.
|
||||
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
|
||||
@ -884,27 +888,29 @@ static int ConvertWordToBoxText(WERD_RES *word,
|
||||
ch = kTesseractReject;
|
||||
word_str[output_size++] = ch;
|
||||
}
|
||||
sprintf(word_str + output_size, " %d %d %d %d\n",
|
||||
sprintf(word_str + output_size, " %d %d %d %d %d\n",
|
||||
blob_box.left() + left, blob_box.bottom() + bottom,
|
||||
blob_box.right() + left, blob_box.top() + bottom);
|
||||
blob_box.right() + left, blob_box.top() + bottom,
|
||||
page_number);
|
||||
output_size += strlen(word_str + output_size);
|
||||
}
|
||||
}
|
||||
return output_size;
|
||||
}
|
||||
|
||||
// Multiplier for max expected textlength assumes typically 4 numbers @
|
||||
// (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
|
||||
// Multiplier for max expected textlength assumes typically 5 numbers @
|
||||
// (5 digits and a space) plus the newline = 5*(5+1)+1. Add to this the
|
||||
// original UTF8 characters, and one kMaxCharsPerChar.
|
||||
const int kCharsPerChar = 25;
|
||||
// A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
|
||||
// space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
|
||||
const int kCharsPerChar = 31;
|
||||
// A maximal single box could occupy 5 numbers at 20 digits (for 64 bit) and a
|
||||
// space plus the newline 5*(20+1)+1 and the maximum length of a UNICHAR.
|
||||
// Test against this on each iteration for safety.
|
||||
const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
|
||||
const int kMaxCharsPerChar = 106 + UNICHAR_LEN;
|
||||
|
||||
// The recognized text is returned as a char* which is coded
|
||||
// as a UTF8 box file and must be freed with the delete [] operator.
|
||||
char* TessBaseAPI::GetBoxText() {
|
||||
// page_number is a 0-based page index that will appear in the box file.
|
||||
char* TessBaseAPI::GetBoxText(int page_number) {
|
||||
int bottom = image_height_ - (rect_top_ + rect_height_);
|
||||
if (tesseract_ == NULL ||
|
||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||
@ -919,7 +925,8 @@ char* TessBaseAPI::GetBoxText() {
|
||||
page_res_it.forward()) {
|
||||
WERD_RES *word = page_res_it.word();
|
||||
ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
|
||||
ptr);
|
||||
image_width_, image_height_,
|
||||
page_number, ptr);
|
||||
// Just in case...
|
||||
if (ptr - result + kMaxCharsPerChar > total_length)
|
||||
break;
|
||||
|
@ -281,14 +281,17 @@ class TESSDLL_API TessBaseAPI {
|
||||
// The recognized text is returned as a char* which is coded
|
||||
// as UTF8 and must be freed with the delete [] operator.
|
||||
char* GetUTF8Text();
|
||||
// The recognized text is returned as a char* which is coded
|
||||
// as HTML with hOCR markup and must be freed with the delete [] operator.
|
||||
char* GetHOCRText(int page_id);
|
||||
// Make a HTML-formatted string with hOCR markup from the internal
|
||||
// data structures.
|
||||
// STL removed from original patch submission and refactored by rays.
|
||||
// page_id is 1-based and will appear in the output.
|
||||
char* GetHOCRText(int page_id);
|
||||
// The recognized text is returned as a char* which is coded in the same
|
||||
// format as a box file used in training. Returned string must be freed with
|
||||
// the delete [] operator.
|
||||
// Constructs coordinates in the original image - not just the rectangle.
|
||||
char* GetBoxText();
|
||||
// page_number is a 0-based page index that will appear in the box file.
|
||||
char* GetBoxText(int page_number);
|
||||
// The recognized text is returned as a char* which is coded
|
||||
// as UNLV format Latin-1 with specific reject and suspect codes
|
||||
// and must be freed with the delete [] operator.
|
||||
|
@ -101,7 +101,7 @@ char szAppName[] = "Tessedit"; //app name
|
||||
// the value of input_file is ignored - ugly, but true - a consequence of
|
||||
// the way that unlv zone file reading takes the place of a page layout
|
||||
// analyzer.
|
||||
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
|
||||
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
|
||||
tesseract::TessBaseAPI* api, STRING* text_out) {
|
||||
api->SetInputName(input_file);
|
||||
#ifdef HAVE_LIBLEPT
|
||||
@ -120,11 +120,11 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
|
||||
if (tessedit_serial_unlv == 0) {
|
||||
char* text;
|
||||
if (tessedit_create_boxfile)
|
||||
text = api->GetBoxText();
|
||||
text = api->GetBoxText(page_index);
|
||||
else if (tessedit_write_unlv)
|
||||
text = api->GetUNLVText();
|
||||
else if (tessedit_create_hocr)
|
||||
text = api->GetHOCRText(page_id);
|
||||
text = api->GetHOCRText(page_index + 1);
|
||||
else
|
||||
text = api->GetUTF8Text();
|
||||
*text_out += text;
|
||||
@ -232,7 +232,7 @@ int main(int argc, char **argv) {
|
||||
api.SetVariable("applybox_page", page_str);
|
||||
|
||||
// Run tesseract on the page!
|
||||
TesseractImage(argv[1], pix, page + 1, &api, &text_out);
|
||||
TesseractImage(argv[1], NULL, pix, page, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
if (tessedit_page_number >= 0) {
|
||||
break;
|
||||
@ -257,13 +257,13 @@ int main(int argc, char **argv) {
|
||||
exit(1);
|
||||
}
|
||||
tprintf("Page %d : %s\n", page, filename);
|
||||
TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
|
||||
TesseractImage(filename, NULL, pix, page, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
++page;
|
||||
}
|
||||
fclose(fp);
|
||||
} else {
|
||||
TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
|
||||
TesseractImage(argv[1], NULL, pix, 0, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
}
|
||||
}
|
||||
@ -297,13 +297,13 @@ int main(int argc, char **argv) {
|
||||
char page_str[kMaxIntSize];
|
||||
snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
|
||||
api.SetVariable("applybox_page", page_str);
|
||||
++page_number;
|
||||
// Read the current page into the Tesseract image.
|
||||
IMAGE image;
|
||||
read_tiff_image(archive, &image);
|
||||
|
||||
// Run tesseract on the page!
|
||||
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
|
||||
++page_number;
|
||||
// Do this while there are more pages in the tiff file.
|
||||
} while (TIFFReadDirectory(archive) &&
|
||||
(page_number <= tessedit_page_number || tessedit_page_number < 0));
|
||||
@ -318,7 +318,7 @@ int main(int argc, char **argv) {
|
||||
if (image.read(image.get_ysize ()) < 0)
|
||||
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
|
||||
invert_image(&image);
|
||||
TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
|
||||
TesseractImage(argv[1], &image, NULL, 0, &api, &text_out);
|
||||
#ifdef _TIFFIO_
|
||||
}
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user