From 134ebc3df39cc2e144eb11575149354bf347def9 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Mon, 15 Feb 2016 18:02:46 -0500 Subject: [PATCH] INCOMPATIBLE fix to hOCR line height information - fixes #225. This fixes the duplicate line IDs caused by inserting height information into the middle of the ID and it moves the line height info into the title attribute like everything else, rather than using non-standard HTML attributes (which won't validate). This change may break consumers of the HTML output, but 3.04 has only been in the wild for 6 months and the current HTML is invalid, so I believe the benefit outweighs the cost for the fix. --- api/baseapi.cpp | 67 +++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 521359b8..e35c425b 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1378,18 +1378,42 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it, hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0); } -static void AddBoxTohOCR(const PageIterator *it, +static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) { + unsigned long bufsize = base.length() + 2 * kMaxIntSize; + char id_buffer[bufsize]; + if (num2 >= 0) { + snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2); + } else { + snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1); + } + id_buffer[bufsize - 1] = '\0'; + *hocr_str += " id='"; + *hocr_str += id_buffer; + *hocr_str += "'"; +} + +static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level, STRING* hocr_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); - hocr_str->add_str_int("' title=\"bbox ", left); + // This is the only place we use double quotes instead of single quotes, + // but it may be too late to change for consistency + hocr_str->add_str_int(" title=\"bbox ", left); hocr_str->add_str_int(" ", top); 
hocr_str->add_str_int(" ", right); hocr_str->add_str_int(" ", bottom); - // Add baseline coordinates for textlines only. - if (level == RIL_TEXTLINE) + // Add baseline coordinates & heights for textlines only. + if (level == RIL_TEXTLINE) { AddBaselineCoordsTohOCR(it, level, hocr_str); + // add custom height measures + float row_height, descenders, ascenders; // row attributes + it->RowAttributes(&row_height, &descenders, &ascenders); + // TODO: Do we want to limit these to a single decimal place? + hocr_str->add_str_double("; x_size ", row_height); + hocr_str->add_str_double("; x_descenders ", descenders * -1); + hocr_str->add_str_double("; x_ascenders ", ascenders); + } *hocr_str += "\">"; } @@ -1408,7 +1432,6 @@ char* TessBaseAPI::GetHOCRText(int page_number) { int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. - float row_height, descenders, ascenders; // row attributes bool font_info = false; GetBoolVariable("hocr_font_info", &font_info); @@ -1434,8 +1457,9 @@ char* TessBaseAPI::GetHOCRText(int page_number) { delete[] utf8_str; #endif - hocr_str.add_str_int("
string()); } else { @@ -1457,36 +1481,29 @@ char* TessBaseAPI::GetHOCRText(int page_number) { // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - hocr_str.add_str_int("
IsAtBeginningOf(RIL_PARA)) { + hocr_str += "\n

ParagraphIsLtr()) { - hocr_str.add_str_int("\n

IsAtBeginningOf(RIL_TEXTLINE)) { - int fontsize; - hocr_str.add_str_int("\n WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); - hocr_str.add_str_int("' title='bbox ", left); + hocr_str.add_str_int(" title='bbox ", left); hocr_str.add_str_int(" ", top); hocr_str.add_str_int(" ", right); hocr_str.add_str_int(" ", bottom);