mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-08-06 13:56:47 +08:00
Fixed issue 263 with modified patch.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@333 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
533a671cff
commit
e3e78b076b
138
api/baseapi.cpp
138
api/baseapi.cpp
@ -703,6 +703,144 @@ char* TessBaseAPI::GetUTF8Text() {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper returns true if there is a paragraph break between bbox_cur,
|
||||
// and bbox_prev.
|
||||
// TODO(rays) improve and incorporate deeper into tesseract, so other
|
||||
// output methods get the benefit.
|
||||
static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
|
||||
int right, int line_height) {
|
||||
// Check if the distance between lines is larger than the normal leading,
|
||||
if (fabs(bbox_cur.bottom() - bbox_prev.bottom()) > line_height * 2)
|
||||
return true;
|
||||
|
||||
// Check if the distance between left bounds of the two lines is nearly the
|
||||
// same as between their right bounds (if so, then both lines probably belong
|
||||
// to the same paragraph, maybe a centered one).
|
||||
if (fabs((bbox_cur.left() - bbox_prev.left()) -
|
||||
(bbox_prev.right() - bbox_cur.right()) < line_height))
|
||||
return false;
|
||||
|
||||
// Check if there is a paragraph indent at this line (either -ve or +ve).
|
||||
if (fabs(bbox_cur.left() - bbox_prev.left()) > line_height)
|
||||
return true;
|
||||
|
||||
// Check if both current and previous line don't reach the right bound of the
|
||||
// block, but the distance is different. This will cause all lines in a verse
|
||||
// to be treated as separate paragraphs, but most probably will not split
|
||||
// block-quotes to separate lines (at least if the text is justified).
|
||||
if (fabs(bbox_cur.right() - bbox_prev.right()) > line_height &&
|
||||
right - bbox_cur.right() > line_height &&
|
||||
right - bbox_prev.right() > line_height)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Helper to add the hOCR for a box to the given hocr_str.
|
||||
static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) {
|
||||
hocr_str->add_str_int("' title=\"bbox ", box.left());
|
||||
hocr_str->add_str_int(" ", image_height - box.top());
|
||||
hocr_str->add_str_int(" ", box.right());
|
||||
hocr_str->add_str_int(" ", image_height - box.bottom());
|
||||
*hocr_str += "\">";
|
||||
}
|
||||
|
||||
// Make a HTML-formatted string with hOCR markup from the internal
|
||||
// data structures.
|
||||
// STL removed from orignal patch submission and refactored by rays.
|
||||
char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||
if (tesseract_ == NULL ||
|
||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||
return NULL;
|
||||
|
||||
PAGE_RES_IT page_res_it(page_res_);
|
||||
ROW_RES *row = NULL; // current row
|
||||
ROW *real_row = NULL, *prev_row = NULL;
|
||||
BLOCK_RES *block = NULL; // current row
|
||||
BLOCK *real_block = NULL;
|
||||
int lcnt = 1, bcnt = 1, wcnt = 1;
|
||||
|
||||
STRING hocr_str;
|
||||
|
||||
hocr_str.add_str_int("<div class='ocr_page' id='page_", page_id);
|
||||
hocr_str += "' title='image \"";
|
||||
hocr_str += *input_file_;
|
||||
hocr_str.add_str_int("\"; bbox ", rect_left_);
|
||||
hocr_str.add_str_int(" ", rect_top_);
|
||||
hocr_str.add_str_int(" ", rect_width_);
|
||||
hocr_str.add_str_int(" ", rect_height_);
|
||||
hocr_str += "'>\n";
|
||||
|
||||
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||
page_res_it.forward()) {
|
||||
if (block != page_res_it.block ()) {
|
||||
|
||||
if (block != NULL) {
|
||||
hocr_str += "</span>\n</p>\n</div>\n";
|
||||
}
|
||||
|
||||
block = page_res_it.block (); // current row
|
||||
real_block = block->block;
|
||||
real_row = NULL;
|
||||
row = NULL;
|
||||
|
||||
hocr_str.add_str_int("<div class='ocr_carea' id='block_", page_id);
|
||||
hocr_str.add_str_int("_", bcnt++);
|
||||
AddBoxTohOCR(real_block->bounding_box(), image_height_, &hocr_str);
|
||||
hocr_str += "\n<p class='ocr_par'>\n";
|
||||
}
|
||||
if (row != page_res_it.row ()) {
|
||||
|
||||
if (row != NULL) {
|
||||
hocr_str += "</span>\n";
|
||||
}
|
||||
prev_row = real_row;
|
||||
|
||||
row = page_res_it.row (); // current row
|
||||
real_row = row->row;
|
||||
|
||||
if (prev_row != NULL &&
|
||||
IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(),
|
||||
real_block->bounding_box().right(),
|
||||
real_row->x_height() + real_row->ascenders()))
|
||||
hocr_str += "</p>\n<p class='ocr_par'>\n";
|
||||
|
||||
hocr_str.add_str_int("<span class='ocr_line' id='line_", page_id);
|
||||
hocr_str.add_str_int("_", lcnt++);
|
||||
AddBoxTohOCR(real_row->bounding_box(), image_height_, &hocr_str);
|
||||
}
|
||||
|
||||
WERD_RES *word = page_res_it.word();
|
||||
WERD_CHOICE* choice = word->best_choice;
|
||||
if (choice != NULL) {
|
||||
hocr_str.add_str_int("<span class='ocr_word' id='word_", page_id);
|
||||
hocr_str.add_str_int("_", wcnt);
|
||||
AddBoxTohOCR(word->word->bounding_box(), image_height_, &hocr_str);
|
||||
hocr_str.add_str_int("<span class='xocr_word' id='xword_", page_id);
|
||||
hocr_str.add_str_int("_", wcnt++);
|
||||
hocr_str.add_str_int("' title=\"x_wconf ", choice->certainty());
|
||||
hocr_str += "\">";
|
||||
if (word->bold > 0)
|
||||
hocr_str += "<strong>";
|
||||
if (word->italic > 0)
|
||||
hocr_str += "<em>";
|
||||
hocr_str += choice->unichar_string();
|
||||
if (word->italic > 0)
|
||||
hocr_str += "</em>";
|
||||
if (word->bold > 0)
|
||||
hocr_str += "</strong>";
|
||||
hocr_str += "</span></span>";
|
||||
if (!word->word->flag(W_EOL))
|
||||
hocr_str += " ";
|
||||
}
|
||||
}
|
||||
hocr_str += "</span>\n</p>\n";
|
||||
hocr_str += "</div>\n</div>\n";
|
||||
char *ret = new char[hocr_str.length() + 1];
|
||||
strcpy(ret, hocr_str.string());
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ConvertWordToBoxText(WERD_RES *word,
|
||||
ROW_RES* row,
|
||||
int left,
|
||||
|
@ -281,6 +281,9 @@ class TESSDLL_API TessBaseAPI {
|
||||
// The recognized text is returned as a char* which is coded
|
||||
// as UTF8 and must be freed with the delete [] operator.
|
||||
char* GetUTF8Text();
|
||||
// The recognized text is returned as a char* which is coded
|
||||
// as HTML with hOCR markup and must be freed with the delete [] operator.
|
||||
char* GetHOCRText(int page_id);
|
||||
// The recognized text is returned as a char* which is coded in the same
|
||||
// format as a box file used in training. Returned string must be freed with
|
||||
// the delete [] operator.
|
||||
|
@ -18,6 +18,7 @@
|
||||
**********************************************************************/
|
||||
|
||||
#include "mfcpch.h"
|
||||
#include <ctype.h>
|
||||
#include "applybox.h"
|
||||
#include "control.h"
|
||||
#include "tessvars.h"
|
||||
@ -62,6 +63,7 @@ void read_tiff_image(TIFF* tif, IMAGE* image);
|
||||
#define EXTERN
|
||||
|
||||
BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
|
||||
BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
|
||||
BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
|
||||
INT_VAR(tessedit_serial_unlv, 0,
|
||||
"0->Whole page, 1->serial no adapt, 2->serial with adapt");
|
||||
@ -100,7 +102,7 @@ char szAppName[] = "Tessedit"; //app name
|
||||
// the value of input_file is ignored - ugly, but true - a consequence of
|
||||
// the way that unlv zone file reading takes the place of a page layout
|
||||
// analyzer.
|
||||
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix,
|
||||
void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_id,
|
||||
tesseract::TessBaseAPI* api, STRING* text_out) {
|
||||
api->SetInputName(input_file);
|
||||
#ifdef HAVE_LIBLEPT
|
||||
@ -122,6 +124,8 @@ void TesseractImage(const char* input_file, IMAGE* image, Pix* pix,
|
||||
text = api->GetBoxText();
|
||||
else if (tessedit_write_unlv)
|
||||
text = api->GetUNLVText();
|
||||
else if (tessedit_create_hocr)
|
||||
text = api->GetHOCRText(page_id);
|
||||
else
|
||||
text = api->GetUTF8Text();
|
||||
*text_out += text;
|
||||
@ -198,37 +202,71 @@ int main(int argc, char **argv) {
|
||||
|
||||
IMAGE image;
|
||||
STRING text_out;
|
||||
int page_number = tessedit_page_number;
|
||||
if (page_number < 0)
|
||||
page_number = 0;
|
||||
FILE* fp = fopen(argv[1], "rb");
|
||||
if (fp == NULL) {
|
||||
tprintf("Image file %s cannot be opened!\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
#ifdef HAVE_LIBLEPT
|
||||
// Use leptonica to read images.
|
||||
// If the image fails to read, try it as a list of filenames.
|
||||
PIX* pix = pixRead(argv[1]);
|
||||
if (pix == NULL) {
|
||||
FILE* fp = fopen(argv[1], "r");
|
||||
if (fp == NULL)
|
||||
READFAILED.error(argv[0], EXIT, argv[1]);
|
||||
char filename[MAX_PATH];
|
||||
while (fgets(filename, sizeof(filename), fp) != NULL) {
|
||||
chomp_string(filename);
|
||||
pix = pixRead(filename);
|
||||
if (pix == NULL)
|
||||
READFAILED.error(argv[0], EXIT, argv[1]);
|
||||
TesseractImage(argv[1], NULL, pix, &api, &text_out);
|
||||
int page = page_number;
|
||||
bool is_tiff = fileFormatIsTiff(fp);
|
||||
fclose(fp);
|
||||
|
||||
if (is_tiff) {
|
||||
for (; (pix = pixReadTiff(argv[1], page)) != NULL; ++page) {
|
||||
if (page > 0)
|
||||
tprintf("Page %d\n", page);
|
||||
char page_str[kMaxIntSize];
|
||||
snprintf(page_str, kMaxIntSize - 1, "%d", page);
|
||||
api.SetVariable("applybox_page", page_str);
|
||||
|
||||
// Run tesseract on the page!
|
||||
TesseractImage(argv[1], pix, page + 1, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
if (tessedit_page_number >= 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The file is not a tiff file, so use the general pixRead function.
|
||||
// If the image fails to read, try it as a list of filenames.
|
||||
PIX* pix = pixRead(argv[1]);
|
||||
if (pix == NULL) {
|
||||
FILE* fp = fopen(argv[1], "r");
|
||||
if (fp == NULL) {
|
||||
tprintf("File %s cannot be opened!\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
char filename[MAX_PATH];
|
||||
while (fgets(filename, sizeof(filename), fp) != NULL) {
|
||||
chomp_string(filename);
|
||||
pix = pixRead(filename);
|
||||
if (pix == NULL) {
|
||||
tprintf("Image file %s cannot be read!\n", filename);
|
||||
exit(1);
|
||||
}
|
||||
tprintf("Page %d : %s\n", page, filename);
|
||||
TesseractImage(filename, NULL, pix, page + 1, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
++page;
|
||||
}
|
||||
fclose(fp);
|
||||
} else {
|
||||
TesseractImage(argv[1], NULL, pix, 1, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
}
|
||||
fclose(fp);
|
||||
} else {
|
||||
TesseractImage(argv[1], NULL, pix, &api, &text_out);
|
||||
pixDestroy(&pix);
|
||||
}
|
||||
#else
|
||||
#ifdef _TIFFIO_
|
||||
int len = strlen(argv[1]);
|
||||
if (len > 3 && strcmp("tif", argv[1] + len - 3) == 0) {
|
||||
char* ext = new char[5];
|
||||
for (int i=4; i>=0; i--)
|
||||
ext[4-i] = (char) tolower((int) argv[1][len - i]);
|
||||
if (len > 3 && (strcmp("tif", ext + 1) == 0 || strcmp("tiff", ext) == 0)) {
|
||||
// Use libtiff to read a tif file so multi-page can be handled.
|
||||
// The page number so the tiff file can be closed and reopened.
|
||||
int page_number = tessedit_page_number;
|
||||
if (page_number < 0)
|
||||
page_number = 0;
|
||||
TIFF* archive = NULL;
|
||||
do {
|
||||
// Since libtiff keeps all read images in memory we have to close the
|
||||
@ -256,7 +294,7 @@ int main(int argc, char **argv) {
|
||||
read_tiff_image(archive, &image);
|
||||
|
||||
// Run tesseract on the page!
|
||||
TesseractImage(argv[1], &image, NULL, &api, &text_out);
|
||||
TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
|
||||
// Do this while there are more pages in the tiff file.
|
||||
} while (TIFFReadDirectory(archive) &&
|
||||
(page_number <= tessedit_page_number || tessedit_page_number < 0));
|
||||
@ -268,19 +306,35 @@ int main(int argc, char **argv) {
|
||||
READFAILED.error (argv[0], EXIT, argv[1]);
|
||||
if (image.read(image.get_ysize ()) < 0)
|
||||
MEMORY_OUT.error(argv[0], EXIT, "Read of image %s", argv[1]);
|
||||
TesseractImage(argv[1], &image, NULL, &api, &text_out);
|
||||
invert_image(&image);
|
||||
TesseractImage(argv[1], &image, NULL, 1, &api, &text_out);
|
||||
#ifdef _TIFFIO_
|
||||
}
|
||||
#endif
|
||||
#endif // HAVE_LIBLEPT
|
||||
|
||||
bool output_hocr = tessedit_create_hocr;
|
||||
outfile = argv[2];
|
||||
outfile += ".txt";
|
||||
FILE* fp = fopen(outfile.string(), "w");
|
||||
if (fp != NULL) {
|
||||
fwrite(text_out.string(), 1, text_out.length(), fp);
|
||||
fclose(fp);
|
||||
outfile += output_hocr ? ".html" : ".txt";
|
||||
fp = fopen(outfile.string(), "w");
|
||||
if (fp == NULL) {
|
||||
tprintf("Cannot create output file %s\n", outfile.string());
|
||||
exit(1);
|
||||
}
|
||||
if (output_hocr) {
|
||||
const char html_header[] =
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
|
||||
" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
|
||||
"<html>\n<head>\n<title></title>\n"
|
||||
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
|
||||
"charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
|
||||
"</head>\n<body>\n";
|
||||
fprintf(fp, "%s", html_header);
|
||||
}
|
||||
fwrite(text_out.string(), 1, text_out.length(), fp);
|
||||
if (output_hocr)
|
||||
fprintf(fp, "</body>\n</html>\n");
|
||||
fclose(fp);
|
||||
|
||||
return 0; //Normal exit
|
||||
}
|
||||
|
@ -65,12 +65,30 @@ static IMAGETYPE imagetypes[] = { {
|
||||
read_tif_image,
|
||||
write_intel_tif
|
||||
},
|
||||
{
|
||||
"TIFF",
|
||||
open_tif_image,
|
||||
read_tif_image,
|
||||
write_moto_tif
|
||||
},
|
||||
{
|
||||
"tiff",
|
||||
open_tif_image,
|
||||
read_tif_image,
|
||||
write_intel_tif
|
||||
},
|
||||
{
|
||||
"bmp",
|
||||
open_bmp_image,
|
||||
read_bmp_image,
|
||||
write_bmp_image
|
||||
},
|
||||
{
|
||||
"BMP",
|
||||
open_bmp_image,
|
||||
read_bmp_image,
|
||||
write_bmp_image
|
||||
},
|
||||
}; //image readers/writers
|
||||
|
||||
#define MAXIMAGETYPES (sizeof(imagetypes)/sizeof(IMAGETYPE))
|
||||
|
Loading…
Reference in New Issue
Block a user