Merge pull request #310 from nickjwhite/hocrcharboxes

Character boxes in hOCR output
This commit is contained in:
zdenop 2019-01-14 19:19:04 +01:00 committed by GitHub
commit f75b2c1948
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 15 additions and 0 deletions

View File

@@ -135,7 +135,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
// Per-call rendering state and output switches.
bool para_is_ltr = true; // Default direction is LTR
const char* paragraph_lang = nullptr;
// Both switches start out false and are then handed by address to
// GetBoolVariable together with the name of the parameter to look up.
bool font_info = false;
// hocr_boxes gates the per-symbol ocrx_cinfo spans written later in this
// function.
bool hocr_boxes = false;
GetBoolVariable("hocr_font_info", &font_info);
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
// No input name recorded yet: call SetInputName with nullptr
// (presumably this installs a default name -- confirm in SetInputName).
if (input_file_ == nullptr) SetInputName(nullptr);
@@ -264,7 +266,16 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
const std::unique_ptr<const char[]> grapheme(
res_it->GetUTF8Text(RIL_SYMBOL));
// Write the current symbol only when its UTF-8 text is non-null and non-empty.
if (grapheme && grapheme[0] != 0) {
if (hocr_boxes) {
// Open a per-symbol span whose title reads
// "x_bboxes <left> <top> <right> <bottom>; x_conf <confidence>".
// NOTE(review): any result returned by BoundingBox is discarded, so a
// failed lookup would leave left/top/right/bottom at their previous
// values -- confirm this cannot happen for a non-empty symbol.
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
<< left << " " << top << " " << right << " " << bottom
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
}
// The symbol text is written in both modes, passed through HOcrEscape.
hocr_str << HOcrEscape(grapheme.get()).c_str();
if (hocr_boxes) {
// Close the span opened above.
hocr_str << "</span>";
}
}
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));

View File

@@ -275,6 +275,8 @@ Tesseract::Tesseract()
"Mark v.bad words for tilde crunch", this->params()),
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
this->params()),
BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
this->params()),
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
this->params()),
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,

View File

@@ -967,6 +967,8 @@ class Tesseract : public Wordrec {
"Mark v.bad words for tilde crunch");
BOOL_VAR_H(hocr_font_info, false,
"Add font info to hocr output");
// Switch for per-character boxes in hOCR output. The second macro argument
// (false) and the help text match the BOOL_MEMBER(hocr_char_boxes, ...)
// entry in Tesseract::Tesseract(); keep the two in sync.
BOOL_VAR_H(hocr_char_boxes, false,
"Add coordinates for each character to hocr output");
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");