diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 1bf1b43d0..88bdf13cf 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1467,9 +1467,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { bool para_is_ltr = true; // Default direction is LTR const char* paragraph_lang = NULL; bool font_info = false; + bool hocr_boxes = false; GetBoolVariable("hocr_font_info", &font_info); + GetBoolVariable("hocr_char_boxes", &hocr_boxes); STRING hocr_str(""); + STRING word(""); if (input_file_ == NULL) SetInputName(NULL); @@ -1564,7 +1567,6 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { } hocr_str.add_str_int("; x_fsize ", pointsize); } - hocr_str += "'"; const char* lang = res_it->WordRecognitionLanguage(); if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { hocr_str += " lang='"; @@ -1580,20 +1582,34 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { default: // Do nothing. break; } - hocr_str += ">"; + + word = ""; + if (hocr_boxes) { + hocr_str += "; x_bboxes"; + } + do { + const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); + if (grapheme && grapheme[0] != 0) { + word += HOcrEscape(grapheme); + } + delete []grapheme; + if (hocr_boxes) { + res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); + hocr_str.add_str_int(" ", left); + hocr_str.add_str_int(" ", top); + hocr_str.add_str_int(" ", right); + hocr_str.add_str_int(" ", bottom); + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + + hocr_str += "'>"; bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); if (bold) hocr_str += ""; if (italic) hocr_str += ""; - do { - const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); - if (grapheme && grapheme[0] != 0) { - hocr_str += HOcrEscape(grapheme); - } - delete []grapheme; - res_it->Next(RIL_SYMBOL); - } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + hocr_str += word; if (italic) hocr_str += ""; if (bold) hocr_str += ""; hocr_str += " "; diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 8db50fbd5..ca4f45a6f 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -273,6 +273,8 @@ Tesseract::Tesseract() "Mark v.bad words for tilde crunch", this->params()), BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params()), + BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output", + this->params()), BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params()), BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 91d25bc8a..ee07b66cd 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -933,6 +933,8 @@ class Tesseract : public Wordrec { "Mark v.bad words for tilde crunch"); BOOL_VAR_H(hocr_font_info, false, "Add font info to hocr output"); + BOOL_VAR_H(hocr_char_boxes, false, + "Add coordinates for each character to hocr output"); BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?"); BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?"); double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");