Merge pull request #2432 from nickjwhite/hocrmoretypes

Add different classes to hOCR output depending on BlockType
This commit is contained in:
zdenop 2019-05-16 17:02:48 +02:00 committed by GitHub
commit 7e9d2f4bc4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -209,8 +209,21 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
hocr_str << "\n <span class='ocr_line'"
<< " id='"
hocr_str << "\n <span class='";
switch (res_it->BlockType()) {
case PT_HEADING_TEXT:
hocr_str << "ocr_header";
break;
case PT_PULLOUT_TEXT:
hocr_str << "ocr_textfloat";
break;
case PT_CAPTION_TEXT:
hocr_str << "ocr_caption";
break;
default:
hocr_str << "ocr_line";
}
hocr_str << "' id='"
<< "line_" << page_id << "_" << lcnt << "'";
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
}