mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Only generate dir for HOCR when needed - fixes #208
Takes advantage of inheritance and the dir="ltr" default to:
- only generate paragraph dirs which are not ltr
- only generate word dirs which don't match the enclosing paragraph

Tested against LTR, RTL, and mixed-direction files. Files for the latter two cases are in a separate commit on the ltr-test-files branch.
This commit is contained in:
parent
c3ad0de69b
commit
381b3a56c6
@ -1432,6 +1432,7 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
||||
|
||||
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
|
||||
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
||||
bool para_is_ltr = true; // Default direction is LTR
|
||||
bool font_info = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
|
||||
@ -1481,15 +1482,15 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
||||
|
||||
// Open any new block/paragraph/textline.
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
para_is_ltr = true; // reset to default direction
|
||||
hocr_str += " <div class='ocr_carea'";
|
||||
AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
|
||||
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
hocr_str += "\n <p class='ocr_par'";
|
||||
if (res_it->ParagraphIsLtr()) {
|
||||
hocr_str += " dir='ltr'";
|
||||
} else {
|
||||
para_is_ltr = res_it->ParagraphIsLtr();
|
||||
if (!para_is_ltr) {
|
||||
hocr_str += " dir='rtl'";
|
||||
}
|
||||
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
|
||||
@ -1531,8 +1532,11 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
||||
hocr_str += "'";
|
||||
}
|
||||
switch (res_it->WordDirection()) {
|
||||
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
|
||||
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
|
||||
// Only emit direction if different from current paragraph direction
|
||||
case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str += " dir='ltr'"; break;
|
||||
case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str += " dir='rtl'"; break;
|
||||
case DIR_MIX:
|
||||
case DIR_NEUTRAL:
|
||||
default: // Do nothing.
|
||||
break;
|
||||
}
|
||||
@ -1562,6 +1566,7 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
||||
if (last_word_in_para) {
|
||||
hocr_str += "\n </p>\n";
|
||||
pcnt++;
|
||||
para_is_ltr = true; // back to default direction
|
||||
}
|
||||
if (last_word_in_block) {
|
||||
hocr_str += " </div>\n";
|
||||
|
Loading…
Reference in New Issue
Block a user