mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 11:09:06 +08:00
Merge pull request #310 from nickjwhite/hocrcharboxes
Character boxes in hOCR output
This commit is contained in:
commit
f75b2c1948
@@ -135,7 +135,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
|
||||
bool para_is_ltr = true; // Default direction is LTR
|
||||
const char* paragraph_lang = nullptr;
|
||||
bool font_info = false;
|
||||
bool hocr_boxes = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
|
||||
|
||||
if (input_file_ == nullptr) SetInputName(nullptr);
|
||||
|
||||
@@ -264,7 +266,16 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
|
||||
const std::unique_ptr<const char[]> grapheme(
|
||||
res_it->GetUTF8Text(RIL_SYMBOL));
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
|
||||
<< left << " " << top << " " << right << " " << bottom
|
||||
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
|
||||
}
|
||||
hocr_str << HOcrEscape(grapheme.get()).c_str();
|
||||
if (hocr_boxes) {
|
||||
hocr_str << "</span>";
|
||||
}
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
|
@@ -275,6 +275,8 @@ Tesseract::Tesseract()
|
||||
"Mark v.bad words for tilde crunch", this->params()),
|
||||
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
|
||||
this->params()),
|
||||
BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
|
||||
this->params()),
|
||||
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
|
||||
this->params()),
|
||||
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
|
||||
|
@@ -967,6 +967,8 @@ class Tesseract : public Wordrec {
|
||||
"Mark v.bad words for tilde crunch");
|
||||
BOOL_VAR_H(hocr_font_info, false,
|
||||
"Add font info to hocr output");
|
||||
BOOL_VAR_H(hocr_char_boxes, false,
|
||||
"Add coordinates for each character to hocr output");
|
||||
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
|
||||
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
|
||||
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
|
||||
|
Loading…
Reference in New Issue
Block a user