From db9c7e0312cbfc7b71d4ca968bc0ac48273d676c Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 16 Dec 2018 19:41:06 +0100 Subject: [PATCH] Use std::stringstream to generate hOCR output Using std::stringstream simplifies the code and allows conversion of double to string independent of the current locale setting. Signed-off-by: Stefan Weil --- src/api/hocrrenderer.cpp | 220 +++++++++++++++++---------------------- 1 file changed, 93 insertions(+), 127 deletions(-) diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index f60ac804..583c1f08 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -17,7 +17,9 @@ * **********************************************************************/ +#include // for std::locale::classic #include // for std::unique_ptr +#include // for std::stringstream #include "baseapi.h" // for TessBaseAPI #include "renderer.h" #include "tesseractclass.h" // for Tesseract @@ -46,10 +48,11 @@ static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) { * direction and does not add any baseline information to the hocr string. */ static void AddBaselineCoordsTohOCR(const PageIterator* it, - PageIteratorLevel level, STRING* hocr_str) { + PageIteratorLevel level, + std::stringstream& hocr_str) { tesseract::Orientation orientation = GetBlockTextOrientation(it); if (orientation != ORIENTATION_PAGE_UP) { - hocr_str->add_str_int("; textangle ", 360 - orientation * 90); + hocr_str << "; textangle " << 360 - orientation * 90; return; } @@ -69,56 +72,25 @@ static void AddBaselineCoordsTohOCR(const PageIterator* it, // Now fit a line through the points so we can extract coefficients for the // equation: y = p1 x + p0 - double p1 = 0; - double p0 = 0; if (x1 == x2) { // Problem computing the polynomial coefficients. 
return; } - p1 = (y2 - y1) / static_cast(x2 - x1); - p0 = y1 - static_cast(p1 * x1); + double p1 = (y2 - y1) / static_cast(x2 - x1); + double p0 = y1 - p1 * x1; - hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0); - hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0); -} - -static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, - int num2) { - const size_t BUFSIZE = 64; - char id_buffer[BUFSIZE]; - if (num2 >= 0) { - snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2); - } else { - snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); - } - id_buffer[BUFSIZE - 1] = '\0'; - *hocr_str += " id='"; - *hocr_str += id_buffer; - *hocr_str += "'"; -} - -static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, - int num2, int num3) { - const size_t BUFSIZE = 64; - char id_buffer[BUFSIZE]; - snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d_%d", base.c_str(), num1, num2, - num3); - id_buffer[BUFSIZE - 1] = '\0'; - *hocr_str += " id='"; - *hocr_str += id_buffer; - *hocr_str += "'"; + hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " + << round(p0 * 1000.0) / 1000.0; } static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level, - STRING* hocr_str) { + std::stringstream& hocr_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); // This is the only place we use double quotes instead of single quotes, // but it may too late to change for consistency - hocr_str->add_str_int(" title=\"bbox ", left); - hocr_str->add_str_int(" ", top); - hocr_str->add_str_int(" ", right); - hocr_str->add_str_int(" ", bottom); + hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " + << bottom; // Add baseline coordinates & heights for textlines only. 
if (level == RIL_TEXTLINE) { AddBaselineCoordsTohOCR(it, level, hocr_str); @@ -126,11 +98,10 @@ static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level, float row_height, descenders, ascenders; // row attributes it->RowAttributes(&row_height, &descenders, &ascenders); // TODO(rays): Do we want to limit these to a single decimal place? - hocr_str->add_str_double("; x_size ", row_height); - hocr_str->add_str_double("; x_descenders ", descenders * -1); - hocr_str->add_str_double("; x_ascenders ", ascenders); + hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders + << "; x_ascenders " << ascenders; } - *hocr_str += "\">"; + hocr_str << "\">"; } /** @@ -166,8 +137,6 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { bool font_info = false; GetBoolVariable("hocr_font_info", &font_info); - STRING hocr_str(""); - if (input_file_ == nullptr) SetInputName(nullptr); #ifdef _WIN32 @@ -187,22 +156,25 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { delete[] utf8_str; #endif - hocr_str += "
\n"; + hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " + << rect_width_ << " " << rect_height_ << "; ppageno " << page_number + << "'>\n"; - ResultIterator* res_it = GetIterator(); + std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); @@ -212,29 +184,30 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { para_is_ltr = true; // reset to default direction - hocr_str += "
IsAtBeginningOf(RIL_PARA)) { - hocr_str += "\n

ParagraphIsLtr(); if (!para_is_ltr) { - hocr_str += " dir='rtl'"; + hocr_str << " dir='rtl'"; } - AddIdTohOCR(&hocr_str, "par", page_id, pcnt); + hocr_str << " id='" + << "par_" << page_id << "_" << pcnt << "'"; paragraph_lang = res_it->WordRecognitionLanguage(); if (paragraph_lang) { - hocr_str += " lang='"; - hocr_str += paragraph_lang; - hocr_str += "'"; + hocr_str << " lang='" << paragraph_lang << "'"; } - AddBoxTohOCR(res_it, RIL_PARA, &hocr_str); + AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str); } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { - hocr_str += "\n lstm_choice_mode) { confidencemap = res_it->GetBestLSTMSymbolChoices(); } - hocr_str += "\n WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); - hocr_str.add_str_int(" title='bbox ", left); - hocr_str.add_str_int(" ", top); - hocr_str.add_str_int(" ", right); - hocr_str.add_str_int(" ", bottom); - hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD)); + hocr_str << " title='bbox " << left << " " << top << " " << right << " " + << bottom << "; x_wconf " + << static_cast(res_it->Confidence(RIL_WORD)); if (font_info) { if (font_name) { - hocr_str += "; x_font "; - hocr_str += HOcrEscape(font_name); + hocr_str << "; x_font " << HOcrEscape(font_name).c_str(); } - hocr_str.add_str_int("; x_fsize ", pointsize); + hocr_str << "; x_fsize " << pointsize; } - hocr_str += "'"; + hocr_str << "'"; const char* lang = res_it->WordRecognitionLanguage(); if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { - hocr_str += " lang='"; - hocr_str += lang; - hocr_str += "'"; + hocr_str << " lang='" << lang << "'"; } switch (res_it->WordDirection()) { // Only emit direction if different from current paragraph direction case DIR_LEFT_TO_RIGHT: - if (!para_is_ltr) hocr_str += " dir='ltr'"; + if (!para_is_ltr) hocr_str << " dir='ltr'"; break; case DIR_RIGHT_TO_LEFT: - if (para_is_ltr) hocr_str += " dir='rtl'"; + if (para_is_ltr) hocr_str << " dir='rtl'"; 
break; case DIR_MIX: case DIR_NEUTRAL: default: // Do nothing. break; } - hocr_str += ">"; + hocr_str << ">"; bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - if (bold) hocr_str += ""; - if (italic) hocr_str += ""; + if (bold) hocr_str << ""; + if (italic) hocr_str << ""; do { const std::unique_ptr grapheme( res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { - hocr_str += HOcrEscape(grapheme.get()); + hocr_str << HOcrEscape(grapheme.get()).c_str(); } res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - if (italic) hocr_str += ""; - if (bold) hocr_str += ""; + if (italic) hocr_str << ""; + if (bold) hocr_str << ""; // If the lstm choice mode is required it is added here if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr) { for (size_t i = 0; i < confidencemap->size(); i++) { - hocr_str += "\n "; std::vector> timestep = (*confidencemap)[i]; for (std::pair conf : timestep) { - hocr_str += "" + << conf.first << ""; gcnt++; } - hocr_str += ""; + hocr_str << ""; tcnt++; } } else if (tesseract_->lstm_choice_mode == 2 && confidencemap != nullptr) { @@ -327,52 +295,50 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { std::vector> timestep = (*confidencemap)[i]; if (timestep.size() > 0) { - hocr_str += "\n "; for (size_t j = 1; j < timestep.size(); j++) { - hocr_str += "" << timestep[j].first << ""; gcnt++; } - hocr_str += ""; + hocr_str << ""; tcnt++; } } } - hocr_str += ""; + hocr_str << ""; tcnt = 1; gcnt = 1; wcnt++; // Close any ending block/paragraph/textline. if (last_word_in_line) { - hocr_str += "\n "; + hocr_str << "\n "; lcnt++; } if (last_word_in_para) { - hocr_str += "\n

\n"; + hocr_str << "\n

\n"; pcnt++; para_is_ltr = true; // back to default direction } if (last_word_in_block) { - hocr_str += "
\n"; + hocr_str << "
\n"; bcnt++; } } - hocr_str += " \n"; + hocr_str << " \n"; - char* ret = new char[hocr_str.length() + 1]; - strcpy(ret, hocr_str.string()); - delete res_it; - return ret; + const std::string& text = hocr_str.str(); + char* result = new char[text.length() + 1]; + strcpy(result, text.c_str()); + return result; } /**********************************************************************