mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
fixed hocr to produce valid document (according to http://validator.w3.org/) - issue http://code.google.com/p/tesseract-ocr/issues/detail?id=401
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@525 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
515ac2d3f0
commit
7511d76315
@ -720,18 +720,18 @@ static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
|
||||
// Check if the distance between lines is larger than the normal leading,
|
||||
if (fabs((float)(bbox_cur.bottom() - bbox_prev.bottom())) > line_height * 2)
|
||||
return true;
|
||||
|
||||
|
||||
// Check if the distance between left bounds of the two lines is nearly the
|
||||
// same as between their right bounds (if so, then both lines probably belong
|
||||
// to the same paragraph, maybe a centered one).
|
||||
if (fabs((float)((bbox_cur.left() - bbox_prev.left()) -
|
||||
(bbox_prev.right() - bbox_cur.right()))) < line_height)
|
||||
return false;
|
||||
|
||||
|
||||
// Check if there is a paragraph indent at this line (either -ve or +ve).
|
||||
if (fabs((float)(bbox_cur.left() - bbox_prev.left())) > line_height)
|
||||
return true;
|
||||
|
||||
|
||||
// Check if both current and previous line don't reach the right bound of the
|
||||
// block, but the distance is different. This will cause all lines in a verse
|
||||
// to be treated as separate paragraphs, but most probably will not split
|
||||
@ -740,7 +740,7 @@ static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
|
||||
right - bbox_cur.right() > line_height &&
|
||||
right - bbox_prev.right() > line_height)
|
||||
return true;
|
||||
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -761,7 +761,7 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||
if (tesseract_ == NULL ||
|
||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||
return NULL;
|
||||
|
||||
|
||||
PAGE_RES_IT page_res_it(page_res_);
|
||||
ROW_RES *row = NULL; // current row
|
||||
ROW *real_row = NULL, *prev_row = NULL;
|
||||
@ -783,37 +783,37 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||
page_res_it.forward()) {
|
||||
if (block != page_res_it.block ()) {
|
||||
|
||||
|
||||
if (block != NULL) {
|
||||
hocr_str += "</span>\n</p>\n</div>\n";
|
||||
}
|
||||
|
||||
|
||||
block = page_res_it.block (); // current row
|
||||
real_block = block->block;
|
||||
real_row = NULL;
|
||||
row = NULL;
|
||||
|
||||
|
||||
hocr_str.add_str_int("<div class='ocr_carea' id='block_", page_id);
|
||||
hocr_str.add_str_int("_", bcnt++);
|
||||
AddBoxTohOCR(real_block->bounding_box(), image_height_, &hocr_str);
|
||||
hocr_str += "\n<p class='ocr_par'>\n";
|
||||
}
|
||||
if (row != page_res_it.row ()) {
|
||||
|
||||
|
||||
if (row != NULL) {
|
||||
hocr_str += "</span>\n";
|
||||
}
|
||||
prev_row = real_row;
|
||||
|
||||
|
||||
row = page_res_it.row (); // current row
|
||||
real_row = row->row;
|
||||
|
||||
if (prev_row != NULL &&
|
||||
|
||||
if (prev_row != NULL &&
|
||||
IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(),
|
||||
real_block->bounding_box().right(),
|
||||
real_row->x_height() + real_row->ascenders()))
|
||||
hocr_str += "</p>\n<p class='ocr_par'>\n";
|
||||
|
||||
|
||||
hocr_str.add_str_int("<span class='ocr_line' id='line_", page_id);
|
||||
hocr_str.add_str_int("_", lcnt++);
|
||||
AddBoxTohOCR(real_row->bounding_box(), image_height_, &hocr_str);
|
||||
@ -834,10 +834,10 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||
if (word->italic > 0)
|
||||
hocr_str += "<em>";
|
||||
int i;
|
||||
// escape special characters
|
||||
// escape special characters
|
||||
for (i = 0;
|
||||
choice->unichar_string()[i] != '\0';
|
||||
i++) {
|
||||
i++) {
|
||||
if (choice->unichar_string()[i] == '<') { hocr_str += "&lt;"; }
|
||||
else if (choice->unichar_string()[i] == '>') { hocr_str += "&gt;"; }
|
||||
else if (choice->unichar_string()[i] == '&') { hocr_str += "&amp;"; }
|
||||
@ -854,8 +854,10 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
|
||||
hocr_str += " ";
|
||||
}
|
||||
}
|
||||
hocr_str += "</span>\n</p>\n";
|
||||
hocr_str += "</div>\n</div>\n";
|
||||
if (block != NULL)
|
||||
hocr_str += "</span>\n</p>\n</div>\n";
|
||||
hocr_str += "</div>\n";
|
||||
|
||||
char *ret = new char[hocr_str.length() + 1];
|
||||
strcpy(ret, hocr_str.string());
|
||||
return ret;
|
||||
|
@ -391,12 +391,14 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
if (output_hocr) {
|
||||
const char html_header[] =
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
|
||||
" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
|
||||
"<html>\n<head>\n<title></title>\n"
|
||||
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
|
||||
"charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
|
||||
"</head>\n<body>\n";
|
||||
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
|
||||
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
|
||||
"<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n"
|
||||
" <title>OCR Output</title>\n"
|
||||
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
|
||||
"charset=utf-8\" />\n <meta name='ocr-system' "
|
||||
"content='tesseract-ocr 3.00' />\n <meta name='ocr-capabilities'"
|
||||
" content='ocr_page' />\n</head>\n<body>\n";
|
||||
fprintf(fout, "%s", html_header);
|
||||
}
|
||||
fwrite(text_out.string(), 1, text_out.length(), fout);
|
||||
|
@ -1,3 +1,3 @@
|
||||
datadir = @datadir@/tessdata/configs
|
||||
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
|
||||
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
|
||||
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
|
||||
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
|
||||
|
@ -198,8 +198,8 @@ target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
|
||||
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
|
||||
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
|
||||
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
|
||||
all: all-am
|
||||
|
||||
.SUFFIXES:
|
||||
|
Loading…
Reference in New Issue
Block a user