Fixed issue 263 with modified patch.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@333 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2010-05-19 18:35:40 +00:00
parent 533a671cff
commit e3e78b076b
4 changed files with 244 additions and 31 deletions

View File

@ -703,6 +703,144 @@ char* TessBaseAPI::GetUTF8Text() {
return result;
}
// Helper returns true if there is a paragraph break between bbox_cur,
// and bbox_prev.
//   bbox_cur    - bounding box of the current text line.
//   bbox_prev   - bounding box of the preceding text line.
//   right       - x coordinate of the right edge of the enclosing block.
//   line_height - nominal height of a text line; used as the tolerance for
//                 every horizontal comparison and (doubled) as the limit for
//                 normal vertical line spacing.
// TODO(rays) improve and incorporate deeper into tesseract, so other
// output methods get the benefit.
static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
                             int right, int line_height) {
  // Check if the distance between lines is larger than the normal leading,
  if (fabs(bbox_cur.bottom() - bbox_prev.bottom()) > line_height * 2)
    return true;

  // Check if the distance between left bounds of the two lines is nearly the
  // same as between their right bounds (if so, then both lines probably belong
  // to the same paragraph, maybe a centered one).
  // The comparison must be applied to the absolute value of the difference:
  // the closing parenthesis of fabs() used to enclose "< line_height" as
  // well, which turned this into a signed test of a boolean.
  if (fabs((bbox_cur.left() - bbox_prev.left()) -
           (bbox_prev.right() - bbox_cur.right())) < line_height)
    return false;

  // Check if there is a paragraph indent at this line (either -ve or +ve).
  if (fabs(bbox_cur.left() - bbox_prev.left()) > line_height)
    return true;

  // Check if both current and previous line don't reach the right bound of the
  // block, but the distance is different. This will cause all lines in a verse
  // to be treated as separate paragraphs, but most probably will not split
  // block-quotes to separate lines (at least if the text is justified).
  if (fabs(bbox_cur.right() - bbox_prev.right()) > line_height &&
      right - bbox_cur.right() > line_height &&
      right - bbox_prev.right() > line_height)
    return true;

  return false;
}
// Helper to add the hOCR for a box to the given hocr_str.
static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
hocr_str->add_str_int("' title=\"bbox ", box.left());
hocr_str->add_str_int(" ", image_height - box.top());
hocr_str->add_str_int(" ", box.right());
hocr_str->add_str_int(" ", image_height - box.bottom());
*hocr_str += "\">";
}
// Make a HTML-formatted string with hOCR markup from the internal
// data structures.
// STL removed from orignal patch submission and refactored by rays.
char* TessBaseAPI::GetHOCRText(int page_id) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
return NULL;
PAGE_RES_IT page_res_it(page_res_);
ROW_RES *row = NULL; // current row
ROW *real_row = NULL, *prev_row = NULL;
BLOCK_RES *block = NULL; // current row
BLOCK *real_block = NULL;
int lcnt = 1, bcnt = 1, wcnt = 1;
STRING hocr_str;
hocr_str.add_str_int("<div class='ocr_page' id='page_", page_id);
hocr_str += "' title='image \"";
hocr_str += *input_file_;
hocr_str.add_str_int("\"; bbox ", rect_left_);
hocr_str.add_str_int(" ", rect_top_);
hocr_str.add_str_int(" ", rect_width_);
hocr_str.add_str_int(" ", rect_height_);
hocr_str += "'>\n";
for (page_res_it.restart_page(); page_res_it.word () != NULL;
page_res_it.forward()) {
if (block != page_res_it.block ()) {
if (block != NULL) {
hocr_str += "</span>\n</p>\n</div>\n";
}
block = page_res_it.block (); // current row
real_block = block->block;
real_row = NULL;
row = NULL;
hocr_str.add_str_int("<div class='ocr_carea' id='block_", page_id);
hocr_str.add_str_int("_", bcnt++);
AddBoxTohOCR(real_block->bounding_box(), image_height_, &hocr_str);
hocr_str += "\n<p class='ocr_par'>\n";
}
if (row != page_res_it.row ()) {
if (row != NULL) {
hocr_str += "</span>\n";
}
prev_row = real_row;
row = page_res_it.row (); // current row
real_row = row->row;
if (prev_row != NULL &&
IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(),
real_block->bounding_box().right(),
real_row->x_height() + real_row->ascenders()))
hocr_str += "</p>\n<p class='ocr_par'>\n";
hocr_str.add_str_int("<span class='ocr_line' id='line_", page_id);
hocr_str.add_str_int("_", lcnt++);
AddBoxTohOCR(real_row->bounding_box(), image_height_, &hocr_str);
}
WERD_RES *word = page_res_it.word();
WERD_CHOICE* choice = word->best_choice;
if (choice != NULL) {
hocr_str.add_str_int("<span class='ocr_word' id='word_", page_id);
hocr_str.add_str_int("_", wcnt);
AddBoxTohOCR(word->word->bounding_box(), image_height_, &hocr_str);
hocr_str.add_str_int("<span class='xocr_word' id='xword_", page_id);
hocr_str.add_str_int("_", wcnt++);
hocr_str.add_str_int("' title=\"x_wconf ", choice->certainty());
hocr_str += "\">";
if (word->bold > 0)
hocr_str += "<strong>";
if (word->italic > 0)
hocr_str += "<em>";
hocr_str += choice->unichar_string();
if (word->italic > 0)
hocr_str += "</em>";
if (word->bold > 0)
hocr_str += "</strong>";
hocr_str += "</span></span>";
if (!word->word->flag(W_EOL))
hocr_str += " ";
}
}
hocr_str += "</span>\n</p>\n";
hocr_str += "</div>\n</div>\n";
char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
return ret;
}
static int ConvertWordToBoxText(WERD_RES *word,
ROW_RES* row,
int left,

View File

@ -281,6 +281,9 @@ class TESSDLL_API TessBaseAPI {
// The recognized text is returned as a char* which is coded
// as UTF8 and must be freed with the delete [] operator.
char* GetUTF8Text();
// The recognized text is returned as a char* which is coded
// as HTML with hOCR markup and must be freed with the delete [] operator.
// page_id is the number written into the id attributes of the page, block,
// line and word elements of the generated markup.
// Returns NULL if there is no recognizer or recognition fails.
char* GetHOCRText(int page_id);
// The recognized text is returned as a char* which is coded in the same
// format as a box file used in training. Returned string must be freed with
// the delete [] operator.

View File

@ -18,6 +18,7 @@
**********************************************************************/
#include "mfcpch.h"
#include <ctype.h>
#include "applybox.h"
#include "control.h"
#include "tessvars.h"
@ -62,6 +63,7 @@ void read_tiff_image(TIFF* tif, IMAGE* image);
#define EXTERN
BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
INT_VAR(tessedit_serial_unlv, 0,
"0->Whole page, 1->serial no adapt, 2->serial with adapt");
@ -100,7 +102,7 @@ char szAppName[] = "Tessedit"; //app name
// the value of input_file is ignored - ugly, but true - a consequence of
// the way that unlv zone file reading takes the place of a page layout
// analyzer.
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix,
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
tesseract::TessBaseAPI* api, STRING* text_out) {
api->SetInputName(input_file);
#ifdef HAVE_LIBLEPT
@ -122,6 +124,8 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix,
text = api->GetBoxText();
else if (tessedit_write_unlv)
text = api->GetUNLVText();
else if (tessedit_create_hocr)
text = api->GetHOCRText(page_id);
else
text = api->GetUTF8Text();
*text_out += text;
@ -198,37 +202,71 @@ int main(int argc, char **argv) {
IMAGE image;
STRING text_out;
int page_number = tessedit_page_number;
if (page_number < 0)
page_number = 0;
FILE* fp = fopen(argv[1], "rb");
if (fp == NULL) {
tprintf("Image file %s cannot be opened!\n", argv[1]);
exit(1);
}
#ifdef HAVE_LIBLEPT
// Use leptonica to read images.
// If the image fails to read, try it as a list of filenames.
PIX* pix = pixRead(argv[1]);
if (pix == NULL) {
FILE* fp = fopen(argv[1], "r");
if (fp == NULL)
READFAILED.error(argv[0], EXIT, argv[1]);
char filename[MAX_PATH];
while (fgets(filename, sizeof(filename), fp) != NULL) {
chomp_string(filename);
pix = pixRead(filename);
if (pix == NULL)
READFAILED.error(argv[0], EXIT, argv[1]);
TesseractImage(argv[1], NULL, pix, &api, &text_out);
int page = page_number;
bool is_tiff = fileFormatIsTiff(fp);
fclose(fp);
if (is_tiff) {
for (; (pix = pixReadTiff(argv[1], page)) != NULL; ++page) {
if (page > 0)
tprintf("Page %d\n", page);
char page_str[kMaxIntSize];
snprintf(page_str, kMaxIntSize - 1, "%d", page);
api.SetVariable("applybox_page", page_str);
// Run tesseract on the page!
TesseractImage(argv[1], pix, page + 1, &api, &text_out);
pixDestroy(&pix);
if (tessedit_page_number >= 0) {
break;
}
}
} else {
// The file is not a tiff file, so use the general pixRead function.
// If the image fails to read, try it as a list of filenames.
PIX* pix = pixRead(argv[1]);
if (pix == NULL) {
FILE* fp = fopen(argv[1], "r");
if (fp == NULL) {
tprintf("File %s cannot be opened!\n", argv[1]);
exit(1);
}
char filename[MAX_PATH];
while (fgets(filename, sizeof(filename), fp) != NULL) {
chomp_string(filename);
pix = pixRead(filename);
if (pix == NULL) {
tprintf("Image file %s cannot be read!\n", filename);
exit(1);
}
tprintf("Page %d : %s\n", page, filename);
TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
pixDestroy(&pix);
++page;
}
fclose(fp);
} else {
TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
pixDestroy(&pix);
}
fclose(fp);
} else {
TesseractImage(argv[1], NULL, pix, &api, &text_out);
pixDestroy(&pix);
}
#else
#ifdef _TIFFIO_
int len = strlen(argv[1]);
if (len > 3 && strcmp("tif", argv[1] + len - 3) == 0) {
char* ext = new char[5];
for (int i=4; i>=0; i--)
ext[4-i] = (char) tolower((int) argv[1][len - i]);
if (len > 3 && (strcmp("tif", ext + 1) == 0 || strcmp("tiff", ext) == 0)) {
// Use libtiff to read a tif file so multi-page can be handled.
// The page number so the tiff file can be closed and reopened.
int page_number = tessedit_page_number;
if (page_number < 0)
page_number = 0;
TIFF* archive = NULL;
do {
// Since libtiff keeps all read images in memory we have to close the
@ -256,7 +294,7 @@ int main(int argc, char **argv) {
read_tiff_image(archive, &image);
// Run tesseract on the page!
TesseractImage(argv[1], &image, NULL, &api, &text_out);
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
// Do this while there are more pages in the tiff file.
} while (TIFFReadDirectory(archive) &&
(page_number <= tessedit_page_number || tessedit_page_number < 0));
@ -268,19 +306,35 @@ int main(int argc, char **argv) {
READFAILED.error (argv[0], EXIT, argv[1]);
if (image.read(image.get_ysize ()) < 0)
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
TesseractImage(argv[1], &image, NULL, &api, &text_out);
invert_image(&image);
TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
#ifdef _TIFFIO_
}
#endif
#endif // HAVE_LIBLEPT
bool output_hocr = tessedit_create_hocr;
outfile = argv[2];
outfile += ".txt";
FILE* fp = fopen(outfile.string(), "w");
if (fp != NULL) {
fwrite(text_out.string(), 1, text_out.length(), fp);
fclose(fp);
outfile += output_hocr ? ".html" : ".txt";
fp = fopen(outfile.string(), "w");
if (fp == NULL) {
tprintf("Cannot create output file %s\n", outfile.string());
exit(1);
}
if (output_hocr) {
const char html_header[] =
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
"<html>\n<head>\n<title></title>\n"
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
"</head>\n<body>\n";
fprintf(fp, "%s", html_header);
}
fwrite(text_out.string(), 1, text_out.length(), fp);
if (output_hocr)
fprintf(fp, "</body>\n</html>\n");
fclose(fp);
return 0; //Normal exit
}

View File

@ -65,12 +65,30 @@ static IMAGETYPE imagetypes[] = { {
read_tif_image,
write_intel_tif
},
{
"TIFF",
open_tif_image,
read_tif_image,
write_moto_tif
},
{
"tiff",
open_tif_image,
read_tif_image,
write_intel_tif
},
{
"bmp",
open_bmp_image,
read_bmp_image,
write_bmp_image
},
{
"BMP",
open_bmp_image,
read_bmp_image,
write_bmp_image
},
}; //image readers/writers
#define MAXIMAGETYPES (sizeof(imagetypes)/sizeof(IMAGETYPE))