mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
Only generate dir for HOCR when needed - fixes #208
Takes advantage of inheritance and the dir="ltr" default to:
- only generate paragraph dirs which are not ltr
- only generate word dirs which don't match the enclosing paragraph

Tested against LTR, RTL, and mixed-direction files. Files for the latter two cases are in a separate commit on the ltr-test-files branch.
This commit is contained in:
parent
c3ad0de69b
commit
381b3a56c6
@ -1432,6 +1432,7 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
|||||||
|
|
||||||
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
|
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
|
||||||
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
||||||
|
bool para_is_ltr = true; // Default direction is LTR
|
||||||
bool font_info = false;
|
bool font_info = false;
|
||||||
GetBoolVariable("hocr_font_info", &font_info);
|
GetBoolVariable("hocr_font_info", &font_info);
|
||||||
|
|
||||||
@ -1481,15 +1482,15 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
|||||||
|
|
||||||
// Open any new block/paragraph/textline.
|
// Open any new block/paragraph/textline.
|
||||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||||
|
para_is_ltr = true; // reset to default direction
|
||||||
hocr_str += " <div class='ocr_carea'";
|
hocr_str += " <div class='ocr_carea'";
|
||||||
AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
|
AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
|
||||||
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
|
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
|
||||||
}
|
}
|
||||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||||
hocr_str += "\n <p class='ocr_par'";
|
hocr_str += "\n <p class='ocr_par'";
|
||||||
if (res_it->ParagraphIsLtr()) {
|
para_is_ltr = res_it->ParagraphIsLtr();
|
||||||
hocr_str += " dir='ltr'";
|
if (!para_is_ltr) {
|
||||||
} else {
|
|
||||||
hocr_str += " dir='rtl'";
|
hocr_str += " dir='rtl'";
|
||||||
}
|
}
|
||||||
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
|
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
|
||||||
@ -1531,8 +1532,11 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
|||||||
hocr_str += "'";
|
hocr_str += "'";
|
||||||
}
|
}
|
||||||
switch (res_it->WordDirection()) {
|
switch (res_it->WordDirection()) {
|
||||||
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
|
// Only emit direction if different from current paragraph direction
|
||||||
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
|
case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str += " dir='ltr'"; break;
|
||||||
|
case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str += " dir='rtl'"; break;
|
||||||
|
case DIR_MIX:
|
||||||
|
case DIR_NEUTRAL:
|
||||||
default: // Do nothing.
|
default: // Do nothing.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1562,6 +1566,7 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
|||||||
if (last_word_in_para) {
|
if (last_word_in_para) {
|
||||||
hocr_str += "\n </p>\n";
|
hocr_str += "\n </p>\n";
|
||||||
pcnt++;
|
pcnt++;
|
||||||
|
para_is_ltr = true; // back to default direction
|
||||||
}
|
}
|
||||||
if (last_word_in_block) {
|
if (last_word_in_block) {
|
||||||
hocr_str += " </div>\n";
|
hocr_str += " </div>\n";
|
||||||
|
Loading…
Reference in New Issue
Block a user