git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@525 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
zdenop@gmail.com 2010-11-17 20:03:58 +00:00
parent 515ac2d3f0
commit 7511d76315
4 changed files with 31 additions and 27 deletions

View File

@ -720,18 +720,18 @@ static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
// Check if the distance between lines is larger than the normal leading.
if (fabs((float)(bbox_cur.bottom() - bbox_prev.bottom())) > line_height * 2)
return true;
// Check if the distance between left bounds of the two lines is nearly the
// same as between their right bounds (if so, then both lines probably belong
// to the same paragraph, maybe a centered one).
if (fabs((float)((bbox_cur.left() - bbox_prev.left()) -
(bbox_prev.right() - bbox_cur.right()))) < line_height)
return false;
// Check if there is a paragraph indent at this line (either negative or
// positive).
if (fabs((float)(bbox_cur.left() - bbox_prev.left())) > line_height)
return true;
// Check if both current and previous line don't reach the right bound of the
// block, but the distance is different. This will cause all lines in a verse
// to be treated as separate paragraphs, but most probably will not split
@ -740,7 +740,7 @@ static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
right - bbox_cur.right() > line_height &&
right - bbox_prev.right() > line_height)
return true;
return false;
}
@ -761,7 +761,7 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
return NULL;
PAGE_RES_IT page_res_it(page_res_);
ROW_RES *row = NULL; // current row
ROW *real_row = NULL, *prev_row = NULL;
@ -783,37 +783,37 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
for (page_res_it.restart_page(); page_res_it.word () != NULL;
page_res_it.forward()) {
if (block != page_res_it.block ()) {
if (block != NULL) {
hocr_str += "</span>\n</p>\n</div>\n";
}
block = page_res_it.block (); // current block
real_block = block->block;
real_row = NULL;
row = NULL;
hocr_str.add_str_int("<div class='ocr_carea' id='block_", page_id);
hocr_str.add_str_int("_", bcnt++);
AddBoxTohOCR(real_block->bounding_box(), image_height_, &hocr_str);
hocr_str += "\n<p class='ocr_par'>\n";
}
if (row != page_res_it.row ()) {
if (row != NULL) {
hocr_str += "</span>\n";
}
prev_row = real_row;
row = page_res_it.row (); // current row
real_row = row->row;
if (prev_row != NULL &&
if (prev_row != NULL &&
IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(),
real_block->bounding_box().right(),
real_row->x_height() + real_row->ascenders()))
hocr_str += "</p>\n<p class='ocr_par'>\n";
hocr_str.add_str_int("<span class='ocr_line' id='line_", page_id);
hocr_str.add_str_int("_", lcnt++);
AddBoxTohOCR(real_row->bounding_box(), image_height_, &hocr_str);
@ -834,10 +834,10 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
if (word->italic > 0)
hocr_str += "<em>";
int i;
// escape special characters
// escape special characters
for (i = 0;
choice->unichar_string()[i] != '\0';
i++) {
i++) {
if (choice->unichar_string()[i] == '<') { hocr_str += "&lt;"; }
else if (choice->unichar_string()[i] == '>') { hocr_str += "&gt;"; }
else if (choice->unichar_string()[i] == '&') { hocr_str += "&amp;"; }
@ -854,8 +854,10 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
hocr_str += " ";
}
}
hocr_str += "</span>\n</p>\n";
hocr_str += "</div>\n</div>\n";
if (block != NULL)
hocr_str += "</span>\n</p>\n</div>\n";
hocr_str += "</div>\n";
char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
return ret;

View File

@ -391,12 +391,14 @@ int main(int argc, char **argv) {
}
if (output_hocr) {
const char html_header[] =
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
"<html>\n<head>\n<title></title>\n"
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
"</head>\n<body>\n";
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
"<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n"
" <title>OCR Output</title>\n"
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" />\n <meta name='ocr-system' "
"content='tesseract-ocr 3.00' />\n <meta name='ocr-capabilities'"
" content='ocr_page' />\n</head>\n<body>\n";
fprintf(fout, "%s", html_header);
}
fwrite(text_out.string(), 1, text_out.length(), fout);

View File

@ -1,3 +1,3 @@
datadir = @datadir@/tessdata/configs
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr

View File

@ -198,8 +198,8 @@ target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
all: all-am
.SUFFIXES: